gpuimgproc module for image processing

Author: Vladislav Vinogradov
Date: 2013-04-17 18:14:35 +04:00
parent d569e72ad4
commit e41aea0acf
66 changed files with 889 additions and 404 deletions


@@ -1,99 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
#else
namespace cv { namespace gpu { namespace cudev
{
namespace blend
{
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
}
}}}
using namespace ::cv::gpu::cudev::blend;
void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
GpuMat& result, Stream& stream)
{
CV_Assert(img1.size() == img2.size());
CV_Assert(img1.type() == img2.type());
CV_Assert(weights1.size() == img1.size());
CV_Assert(weights2.size() == img2.size());
CV_Assert(weights1.type() == CV_32F);
CV_Assert(weights2.type() == CV_32F);
const Size size = img1.size();
const int depth = img1.depth();
const int cn = img1.channels();
result.create(size, CV_MAKE_TYPE(depth, cn));
switch (depth)
{
case CV_8U:
if (cn != 4)
blendLinearCaller<uchar>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
else
blendLinearCaller8UC4(size.height, size.width, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
break;
case CV_32F:
blendLinearCaller<float>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
break;
default:
CV_Error(cv::Error::StsUnsupportedFormat, "bad image depth in linear blending function");
}
}
#endif
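
For context, here is a minimal host-side usage sketch of the blendLinear wrapper above. The image file names, weight values and header paths are assumptions for illustration and are not part of this commit.

// Sketch: blend two equally sized, same-type images with per-pixel CV_32F weights.
#include "opencv2/gpu/gpu.hpp"          // header path assumed for this stage of the module split
#include "opencv2/highgui/highgui.hpp"

int main()
{
    cv::Mat img1 = cv::imread("left.png");   // hypothetical inputs
    cv::Mat img2 = cv::imread("right.png");
    cv::Mat w1(img1.size(), CV_32F, cv::Scalar(0.7f));   // arbitrary example weights
    cv::Mat w2(img2.size(), CV_32F, cv::Scalar(0.3f));

    cv::gpu::GpuMat d_img1(img1), d_img2(img2), d_w1(w1), d_w2(w2), d_result;
    cv::gpu::blendLinear(d_img1, d_img2, d_w1, d_w2, d_result, cv::gpu::Stream::Null());

    cv::Mat result;
    d_result.download(result);   // result(y,x) = (w1*img1 + w2*img2) / (w1 + w2)
    return 0;
}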

File diff suppressed because it is too large


@@ -1,199 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
using namespace cv::gpu;
typedef unsigned char uchar;
typedef unsigned short ushort;
//////////////////////////////////////////////////////////////////////////////////
/// Bilateral filtering
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
__device__ __forceinline__ float norm_l1(const float& a) { return ::fabs(a); }
__device__ __forceinline__ float norm_l1(const float2& a) { return ::fabs(a.x) + ::fabs(a.y); }
__device__ __forceinline__ float norm_l1(const float3& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z); }
__device__ __forceinline__ float norm_l1(const float4& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z) + ::fabs(a.w); }
__device__ __forceinline__ float sqr(const float& a) { return a * a; }
template<typename T, typename B>
__global__ void bilateral_kernel(const PtrStepSz<T> src, PtrStep<T> dst, const B b, const int ksz, const float sigma_spatial2_inv_half, const float sigma_color2_inv_half)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x >= src.cols || y >= src.rows)
return;
value_type center = saturate_cast<value_type>(src(y, x));
value_type sum1 = VecTraits<value_type>::all(0);
float sum2 = 0;
int r = ksz / 2;
float r2 = (float)(r * r);
int tx = x - r + ksz;
int ty = y - r + ksz;
if (x - ksz/2 >=0 && y - ksz/2 >=0 && tx < src.cols && ty < src.rows)
{
for (int cy = y - r; cy < ty; ++cy)
for (int cx = x - r; cx < tx; ++cx)
{
float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
if (space2 > r2)
continue;
value_type value = saturate_cast<value_type>(src(cy, cx));
float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
sum1 = sum1 + weight * value;
sum2 = sum2 + weight;
}
}
else
{
for (int cy = y - r; cy < ty; ++cy)
for (int cx = x - r; cx < tx; ++cx)
{
float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
if (space2 > r2)
continue;
value_type value = saturate_cast<value_type>(b.at(cy, cx, src.data, src.step));
float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
sum1 = sum1 + weight * value;
sum2 = sum2 + weight;
}
}
dst(y, x) = saturate_cast<T>(sum1 / sum2);
}
template<typename T, template <typename> class B>
void bilateral_caller(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream)
{
dim3 block (32, 8);
dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));
B<T> b(src.rows, src.cols);
float sigma_spatial2_inv_half = -0.5f/(sigma_spatial * sigma_spatial);
float sigma_color2_inv_half = -0.5f/(sigma_color * sigma_color);
cudaSafeCall( cudaFuncSetCacheConfig (bilateral_kernel<T, B<T> >, cudaFuncCachePreferL1) );
bilateral_kernel<<<grid, block>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, kernel_size, sigma_spatial2_inv_half, sigma_color2_inv_half);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template<typename T>
void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float gauss_spatial_coeff, float gauss_color_coeff, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream);
static caller_t funcs[] =
{
bilateral_caller<T, BrdReflect101>,
bilateral_caller<T, BrdReplicate>,
bilateral_caller<T, BrdConstant>,
bilateral_caller<T, BrdReflect>,
bilateral_caller<T, BrdWrap>,
};
funcs[borderMode](src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
}
}
}}}
#define OCV_INSTANTIATE_BILATERAL_FILTER(T) \
template void cv::gpu::cudev::imgproc::bilateral_filter_gpu<T>(const PtrStepSzb&, PtrStepSzb, int, float, float, int, cudaStream_t);
OCV_INSTANTIATE_BILATERAL_FILTER(uchar)
//OCV_INSTANTIATE_BILATERAL_FILTER(uchar2)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar3)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar4)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar2)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar3)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar4)
OCV_INSTANTIATE_BILATERAL_FILTER(short)
//OCV_INSTANTIATE_BILATERAL_FILTER(short2)
OCV_INSTANTIATE_BILATERAL_FILTER(short3)
OCV_INSTANTIATE_BILATERAL_FILTER(short4)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort)
//OCV_INSTANTIATE_BILATERAL_FILTER(ushort2)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort3)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort4)
//OCV_INSTANTIATE_BILATERAL_FILTER(int)
//OCV_INSTANTIATE_BILATERAL_FILTER(int2)
//OCV_INSTANTIATE_BILATERAL_FILTER(int3)
//OCV_INSTANTIATE_BILATERAL_FILTER(int4)
OCV_INSTANTIATE_BILATERAL_FILTER(float)
//OCV_INSTANTIATE_BILATERAL_FILTER(float2)
OCV_INSTANTIATE_BILATERAL_FILTER(float3)
OCV_INSTANTIATE_BILATERAL_FILTER(float4)
#endif /* CUDA_DISABLER */
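
For reference, the weight computed in bilateral_kernel is the standard bilateral-filter weight with an L1 colour distance and a circular spatial window of radius r = ksz/2:

w(p,q) = \exp\left( -\frac{\lVert p - q \rVert_2^2}{2\sigma_s^2} - \frac{\lVert I(p) - I(q) \rVert_1^2}{2\sigma_c^2} \right), \qquad
D(p) = \frac{\sum_{\lVert p - q \rVert_2^2 \le r^2} w(p,q)\, I(q)}{\sum_{\lVert p - q \rVert_2^2 \le r^2} w(p,q)}

The precomputed factors sigma_spatial2_inv_half = -0.5/sigma_spatial^2 and sigma_color2_inv_half = -0.5/sigma_color^2 are exactly the two -1/(2*sigma^2) coefficients in the exponent.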


@@ -1,121 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace blend
{
template <typename T>
__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < rows && x < cols)
{
int x_ = x / cn;
float w1 = weights1.ptr(y)[x_];
float w2 = weights2.ptr(y)[x_];
T p1 = img1.ptr(y)[x];
T p2 = img2.ptr(y)[x];
result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
}
}
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < rows && x < cols)
{
float w1 = weights1.ptr(y)[x];
float w2 = weights2.ptr(y)[x];
float sum_inv = 1.f / (w1 + w2 + 1e-5f);
w1 *= sum_inv;
w2 *= sum_inv;
uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];
uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];
((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
}
}
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
} // namespace blend
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
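
Per pixel and per channel, both kernels above compute

result(y,x) = \frac{w_1(y,x)\, p_1(y,x) + w_2(y,x)\, p_2(y,x)}{w_1(y,x) + w_2(y,x) + 10^{-5}}

where the 1e-5 term guards against division by zero when both weights vanish; the 8UC4 specialisation normalises the two weights once and applies them to all four channels.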


@@ -1,494 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <utility>
#include <algorithm>//std::swap
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace canny
{
struct L1 : binary_function<int, int, float>
{
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::abs(x) + ::abs(y);
}
__device__ __forceinline__ L1() {}
__device__ __forceinline__ L1(const L1&) {}
};
struct L2 : binary_function<int, int, float>
{
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::sqrtf(x * x + y * y);
}
__device__ __forceinline__ L2() {}
__device__ __forceinline__ L2(const L2&) {}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
{
enum { smart_shift = 4 };
};
template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
{
enum { smart_shift = 4 };
};
}}}
namespace canny
{
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
struct SrcTex
{
const int xoff;
const int yoff;
__host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
__device__ __forceinline__ int operator ()(int y, int x) const
{
return tex2D(tex_src, x + xoff, y + yoff);
}
};
template <class Norm> __global__
void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y >= mag.rows || x >= mag.cols)
return;
int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
dx(y, x) = dxVal;
dy(y, x) = dyVal;
mag(y, x) = norm(dxVal, dyVal);
}
void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
{
const dim3 block(16, 16);
const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
bindTexture(&tex_src, srcWhole);
SrcTex src(xoff, yoff);
if (L2Grad)
{
L2 norm;
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
}
else
{
L1 norm;
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
{
if (L2Grad)
{
L2 norm;
transform(dx, dy, mag, norm, WithOutMask(), 0);
}
else
{
L1 norm;
transform(dx, dy, mag, norm, WithOutMask(), 0);
}
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
{
const int CANNY_SHIFT = 15;
const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
return;
int dxVal = dx(y, x);
int dyVal = dy(y, x);
const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
const float m = tex2D(tex_mag, x, y);
dxVal = ::abs(dxVal);
dyVal = ::abs(dyVal);
// 0 - the pixel can not belong to an edge
// 1 - the pixel might belong to an edge
// 2 - the pixel does belong to an edge
int edge_type = 0;
if (m > low_thresh)
{
const int tg22x = dxVal * TG22;
const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
dyVal <<= CANNY_SHIFT;
if (dyVal < tg22x)
{
if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
edge_type = 1 + (int)(m > high_thresh);
}
else if(dyVal > tg67x)
{
if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
else
{
if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
}
map(y, x) = edge_type;
}
void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
bindTexture(&tex_mag, mag);
calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
__device__ int counter = 0;
__global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
{
__shared__ volatile int smem[18][18];
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
if (threadIdx.y == 0)
smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
if (threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
if (threadIdx.x == 0)
smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1)
smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == 0)
smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
__syncthreads();
if (x >= map.cols || y >= map.rows)
return;
int n;
#pragma unroll
for (int k = 0; k < 16; ++k)
{
n = 0;
if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
{
n += smem[threadIdx.y ][threadIdx.x ] == 2;
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
}
if (n > 0)
smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
}
const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
map(y, x) = e;
n = 0;
if (e == 2)
{
n += smem[threadIdx.y ][threadIdx.x ] == 1;
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
}
if (n > 0)
{
const int ind = ::atomicAdd(&counter, 1);
st[ind] = make_ushort2(x, y);
}
}
void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
const dim3 block(16, 16);
const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
__global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
{
const int stack_size = 512;
__shared__ int s_counter;
__shared__ int s_ind;
__shared__ ushort2 s_st[stack_size];
if (threadIdx.x == 0)
s_counter = 0;
__syncthreads();
int ind = blockIdx.y * gridDim.x + blockIdx.x;
if (ind >= count)
return;
ushort2 pos = st1[ind];
if (threadIdx.x < 8)
{
pos.x += c_dx[threadIdx.x];
pos.y += c_dy[threadIdx.x];
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{
map(pos.y, pos.x) = 2;
ind = Emulation::smem::atomicAdd(&s_counter, 1);
s_st[ind] = pos;
}
}
__syncthreads();
while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
{
const int subTaskIdx = threadIdx.x >> 3;
const int portion = ::min(s_counter, blockDim.x >> 3);
if (subTaskIdx < portion)
pos = s_st[s_counter - 1 - subTaskIdx];
__syncthreads();
if (threadIdx.x == 0)
s_counter -= portion;
__syncthreads();
if (subTaskIdx < portion)
{
pos.x += c_dx[threadIdx.x & 7];
pos.y += c_dy[threadIdx.x & 7];
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{
map(pos.y, pos.x) = 2;
ind = Emulation::smem::atomicAdd(&s_counter, 1);
s_st[ind] = pos;
}
}
__syncthreads();
}
if (s_counter > 0)
{
if (threadIdx.x == 0)
{
ind = ::atomicAdd(&counter, s_counter);
s_ind = ind - s_counter;
}
__syncthreads();
ind = s_ind;
for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
st2[ind + i] = s_st[i];
}
}
void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
while (count > 0)
{
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
const dim3 block(128);
const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
std::swap(st1, st2);
}
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
struct GetEdges : unary_function<int, uchar>
{
__device__ __forceinline__ uchar operator ()(int e) const
{
return (uchar)(-(e >> 1));
}
__device__ __forceinline__ GetEdges() {}
__device__ __forceinline__ GetEdges(const GetEdges&) {}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
{
enum { smart_shift = 4 };
};
}}}
namespace canny
{
void getEdges(PtrStepSzi map, PtrStepSzb dst)
{
transform(map, dst, GetEdges(), WithOutMask(), 0);
}
}
#endif /* CUDA_DISABLER */
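
Taken together, these kernels implement the usual GPU Canny pipeline: calcMagnitude evaluates the 3x3 Sobel derivatives and the gradient norm, calcMap performs non-maximum suppression along the gradient direction (sectors split at tan 22.5 and tan 67.5 degrees in fixed point) and thresholds the result (0 = not an edge, 1 = weak, 2 = strong), edgesHysteresisLocal/Global grow strong edges into neighbouring weak pixels through the st1/st2 work queues, and getEdges writes 255 for map value 2 and 0 otherwise. The magnitude step computes

d_x = (I_{y-1,x+1} + 2 I_{y,x+1} + I_{y+1,x+1}) - (I_{y-1,x-1} + 2 I_{y,x-1} + I_{y+1,x-1})
d_y = (I_{y+1,x-1} + 2 I_{y+1,x} + I_{y+1,x+1}) - (I_{y-1,x-1} + 2 I_{y-1,x} + I_{y-1,x+1})
\lVert\nabla I\rVert = |d_x| + |d_y| \;(\text{L1}) \quad\text{or}\quad \sqrt{d_x^2 + d_y^2} \;(\text{L2})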


@@ -1,534 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <opencv2/core/cuda/common.hpp>
#include <opencv2/core/cuda/vec_traits.hpp>
#include <opencv2/core/cuda/vec_math.hpp>
#include <opencv2/core/cuda/emulation.hpp>
#include <iostream>
#include <stdio.h>
namespace cv { namespace gpu { namespace cudev
{
namespace ccl
{
enum
{
WARP_SIZE = 32,
WARP_LOG = 5,
CTA_SIZE_X = 32,
CTA_SIZE_Y = 8,
STA_SIZE_MERGE_Y = 4,
STA_SIZE_MERGE_X = 32,
TPB_X = 1,
TPB_Y = 4,
TILE_COLS = CTA_SIZE_X * TPB_X,
TILE_ROWS = CTA_SIZE_Y * TPB_Y
};
template<typename T> struct IntervalsTraits
{
typedef T elem_type;
};
template<> struct IntervalsTraits<unsigned char>
{
typedef int dist_type;
enum {ch = 1};
};
template<> struct IntervalsTraits<uchar3>
{
typedef int3 dist_type;
enum {ch = 3};
};
template<> struct IntervalsTraits<uchar4>
{
typedef int4 dist_type;
enum {ch = 4};
};
template<> struct IntervalsTraits<unsigned short>
{
typedef int dist_type;
enum {ch = 1};
};
template<> struct IntervalsTraits<ushort3>
{
typedef int3 dist_type;
enum {ch = 3};
};
template<> struct IntervalsTraits<ushort4>
{
typedef int4 dist_type;
enum {ch = 4};
};
template<> struct IntervalsTraits<float>
{
typedef float dist_type;
enum {ch = 1};
};
template<> struct IntervalsTraits<int>
{
typedef int dist_type;
enum {ch = 1};
};
typedef unsigned char component;
enum Edges { UP = 1, DOWN = 2, LEFT = 4, RIGHT = 8, EMPTY = 0xF0 };
template<typename T, int CH> struct InInterval {};
template<typename T> struct InInterval<T, 1>
{
typedef typename VecTraits<T>::elem_type E;
__host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi) : lo((E)(-_lo.x)), hi((E)_hi.x) {};
T lo, hi;
template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
{
I d = a - b;
return lo <= d && d <= hi;
}
};
template<typename T> struct InInterval<T, 3>
{
typedef typename VecTraits<T>::elem_type E;
__host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
: lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z)){};
T lo, hi;
template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
{
I d = a - b;
return lo.x <= d.x && d.x <= hi.x &&
lo.y <= d.y && d.y <= hi.y &&
lo.z <= d.z && d.z <= hi.z;
}
};
template<typename T> struct InInterval<T, 4>
{
typedef typename VecTraits<T>::elem_type E;
__host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
: lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z), (E)(-_lo.w))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z, (E)_hi.w)){};
T lo, hi;
template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
{
I d = a - b;
return lo.x <= d.x && d.x <= hi.x &&
lo.y <= d.y && d.y <= hi.y &&
lo.z <= d.z && d.z <= hi.z &&
lo.w <= d.w && d.w <= hi.w;
}
};
template<typename T, typename F>
__global__ void computeConnectivity(const PtrStepSz<T> image, PtrStepSzb components, F connected)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x >= image.cols || y >= image.rows) return;
T intensity = image(y, x);
component c = 0;
if ( x > 0 && connected(intensity, image(y, x - 1)))
c |= LEFT;
if ( y > 0 && connected(intensity, image(y - 1, x)))
c |= UP;
if ( x + 1 < image.cols && connected(intensity, image(y, x + 1)))
c |= RIGHT;
if ( y + 1 < image.rows && connected(intensity, image(y + 1, x)))
c |= DOWN;
components(y, x) = c;
}
template< typename T>
void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream)
{
dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
dim3 grid(divUp(image.cols, block.x), divUp(image.rows, block.y));
typedef InInterval<typename IntervalsTraits<T>::dist_type, IntervalsTraits<T>::ch> Int_t;
Int_t inInt(lo, hi);
computeConnectivity<T, Int_t><<<grid, block, 0, stream>>>(static_cast<const PtrStepSz<T> >(image), edges, inInt);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void computeEdges<uchar> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<uchar3> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<uchar4> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<ushort> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<ushort3>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<ushort4>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<int> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<float> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
__global__ void lableTiles(const PtrStepSzb edges, PtrStepSzi comps)
{
int x = threadIdx.x + blockIdx.x * TILE_COLS;
int y = threadIdx.y + blockIdx.y * TILE_ROWS;
if (x >= edges.cols || y >= edges.rows) return;
// TPB_X is currently 1, so the inner loop over j runs a single iteration per thread
int bounds = ((y + TPB_Y) < edges.rows);
__shared__ int labelsTile[TILE_ROWS][TILE_COLS];
__shared__ int edgesTile[TILE_ROWS][TILE_COLS];
int new_labels[TPB_Y][TPB_X];
int old_labels[TPB_Y][TPB_X];
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
int yloc = threadIdx.y + CTA_SIZE_Y * i;
int xloc = threadIdx.x + CTA_SIZE_X * j;
component c = edges(bounds * (y + CTA_SIZE_Y * i), x + CTA_SIZE_X * j);
if (!xloc) c &= ~LEFT;
if (!yloc) c &= ~UP;
if (xloc == TILE_COLS -1) c &= ~RIGHT;
if (yloc == TILE_ROWS -1) c &= ~DOWN;
new_labels[i][j] = yloc * TILE_COLS + xloc;
edgesTile[yloc][xloc] = c;
}
for (int k = 0; ;++k)
{
//1. backup
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
int yloc = threadIdx.y + CTA_SIZE_Y * i;
int xloc = threadIdx.x + CTA_SIZE_X * j;
old_labels[i][j] = new_labels[i][j];
labelsTile[yloc][xloc] = new_labels[i][j];
}
__syncthreads();
//2. compare local arrays
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
int yloc = threadIdx.y + CTA_SIZE_Y * i;
int xloc = threadIdx.x + CTA_SIZE_X * j;
component c = edgesTile[yloc][xloc];
int label = new_labels[i][j];
if (c & UP)
label = ::min(label, labelsTile[yloc - 1][xloc]);
if (c & DOWN)
label = ::min(label, labelsTile[yloc + 1][xloc]);
if (c & LEFT)
label = ::min(label, labelsTile[yloc][xloc - 1]);
if (c & RIGHT)
label = ::min(label, labelsTile[yloc][xloc + 1]);
new_labels[i][j] = label;
}
__syncthreads();
//3. determine: Is any value changed?
int changed = 0;
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
if (new_labels[i][j] < old_labels[i][j])
{
changed = 1;
Emulation::smem::atomicMin(&labelsTile[0][0] + old_labels[i][j], new_labels[i][j]);
}
}
changed = Emulation::syncthreadsOr(changed);
if (!changed)
break;
//4. Compact paths
const int *labels = &labelsTile[0][0];
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
int label = new_labels[i][j];
while( labels[label] < label ) label = labels[label];
new_labels[i][j] = label;
}
__syncthreads();
}
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
int label = new_labels[i][j];
int yloc = label / TILE_COLS;
int xloc = label - yloc * TILE_COLS;
xloc += blockIdx.x * TILE_COLS;
yloc += blockIdx.y * TILE_ROWS;
label = yloc * edges.cols + xloc;
// do it for x too.
if (y + CTA_SIZE_Y * i < comps.rows) comps(y + CTA_SIZE_Y * i, x + CTA_SIZE_X * j) = label;
}
}
__device__ __forceinline__ int root(const PtrStepSzi& comps, int label)
{
while(1)
{
int y = label / comps.cols;
int x = label - y * comps.cols;
int parent = comps(y, x);
if (label == parent) break;
label = parent;
}
return label;
}
__device__ __forceinline__ void isConnected(PtrStepSzi& comps, int l1, int l2, bool& changed)
{
int r1 = root(comps, l1);
int r2 = root(comps, l2);
if (r1 == r2) return;
int mi = ::min(r1, r2);
int ma = ::max(r1, r2);
int y = ma / comps.cols;
int x = ma - y * comps.cols;
atomicMin(&comps.ptr(y)[x], mi);
changed = true;
}
__global__ void crossMerge(const int tilesNumY, const int tilesNumX, int tileSizeY, int tileSizeX,
const PtrStepSzb edges, PtrStepSzi comps, const int yIncomplete, int xIncomplete)
{
int tid = threadIdx.y * blockDim.x + threadIdx.x;
int stride = blockDim.y * blockDim.x;
int ybegin = blockIdx.y * (tilesNumY * tileSizeY);
int yend = ybegin + tilesNumY * tileSizeY;
if (blockIdx.y == gridDim.y - 1)
{
yend -= yIncomplete * tileSizeY;
yend -= tileSizeY;
tileSizeY = (edges.rows % tileSizeY);
yend += tileSizeY;
}
int xbegin = blockIdx.x * tilesNumX * tileSizeX;
int xend = xbegin + tilesNumX * tileSizeX;
if (blockIdx.x == gridDim.x - 1)
{
if (xIncomplete) yend = ybegin;
xend -= xIncomplete * tileSizeX;
xend -= tileSizeX;
tileSizeX = (edges.cols % tileSizeX);
xend += tileSizeX;
}
if (blockIdx.y == (gridDim.y - 1) && yIncomplete)
{
xend = xbegin;
}
int tasksV = (tilesNumX - 1) * (yend - ybegin);
int tasksH = (tilesNumY - 1) * (xend - xbegin);
int total = tasksH + tasksV;
bool changed;
do
{
changed = false;
for (int taskIdx = tid; taskIdx < total; taskIdx += stride)
{
if (taskIdx < tasksH)
{
int indexH = taskIdx;
int row = indexH / (xend - xbegin);
int col = indexH - row * (xend - xbegin);
int y = ybegin + (row + 1) * tileSizeY;
int x = xbegin + col;
component e = edges( x, y);
if (e & UP)
{
int lc = comps(y,x);
int lu = comps(y - 1, x);
isConnected(comps, lc, lu, changed);
}
}
else
{
int indexV = taskIdx - tasksH;
int col = indexV / (yend - ybegin);
int row = indexV - col * (yend - ybegin);
int x = xbegin + (col + 1) * tileSizeX;
int y = ybegin + row;
component e = edges(x, y);
if (e & LEFT)
{
int lc = comps(y, x);
int ll = comps(y, x - 1);
isConnected(comps, lc, ll, changed);
}
}
}
} while (Emulation::syncthreadsOr(changed));
}
__global__ void flatten(const PtrStepSzb edges, PtrStepSzi comps)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if( x < comps.cols && y < comps.rows)
comps(y, x) = root(comps, comps(y, x));
}
enum {CC_NO_COMPACT = 0, CC_COMPACT_LABELS = 1};
void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
{
(void) flags;
dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
lableTiles<<<grid, block, 0, stream>>>(edges, comps);
cudaSafeCall( cudaGetLastError() );
int tileSizeX = TILE_COLS, tileSizeY = TILE_ROWS;
while (grid.x > 1 || grid.y > 1)
{
dim3 mergeGrid((int)ceilf(grid.x / 2.f), (int)ceilf(grid.y / 2.f));
dim3 mergeBlock(STA_SIZE_MERGE_X, STA_SIZE_MERGE_Y);
// debug log
// std::cout << "merging: " << grid.y << " x " << grid.x << " ---> " << mergeGrid.y << " x " << mergeGrid.x << " for tiles: " << tileSizeY << " x " << tileSizeX << std::endl;
crossMerge<<<mergeGrid, mergeBlock, 0, stream>>>(2, 2, tileSizeY, tileSizeX, edges, comps, (int)ceilf(grid.y / 2.f) - grid.y / 2, (int)ceilf(grid.x / 2.f) - grid.x / 2);
tileSizeX <<= 1;
tileSizeY <<= 1;
grid = mergeGrid;
cudaSafeCall( cudaGetLastError() );
}
grid.x = divUp(edges.cols, block.x);
grid.y = divUp(edges.rows, block.y);
flatten<<<grid, block, 0, stream>>>(edges, comps);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
} } }
#endif /* CUDA_DISABLER */
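
In summary, computeConnectivity marks which of a pixel's four neighbours fall inside the user-supplied intensity interval,

p \sim q \iff -\mathrm{lo}_c \le I_c(p) - I_c(q) \le \mathrm{hi}_c \quad \text{for every channel } c,

lableTiles then propagates minimum labels inside each 32x32 tile until convergence, crossMerge stitches tile borders with a union-find (root() follows parent links stored as flattened y*cols + x indices, and isConnected lowers the larger of two roots onto the smaller one with atomicMin), and flatten() finally replaces every label by its root.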


@@ -1,186 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/scan.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace clahe
{
__global__ void calcLutKernel(const PtrStepb src, PtrStepb lut,
const int2 tileSize, const int tilesX,
const int clipLimit, const float lutScale)
{
__shared__ int smem[512];
const int tx = blockIdx.x;
const int ty = blockIdx.y;
const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
smem[tid] = 0;
__syncthreads();
for (int i = threadIdx.y; i < tileSize.y; i += blockDim.y)
{
const uchar* srcPtr = src.ptr(ty * tileSize.y + i) + tx * tileSize.x;
for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
{
const int data = srcPtr[j];
Emulation::smem::atomicAdd(&smem[data], 1);
}
}
__syncthreads();
int tHistVal = smem[tid];
__syncthreads();
if (clipLimit > 0)
{
// clip histogram bar
int clipped = 0;
if (tHistVal > clipLimit)
{
clipped = tHistVal - clipLimit;
tHistVal = clipLimit;
}
// find number of overall clipped samples
reduce<256>(smem, clipped, tid, plus<int>());
// broadcast evaluated value
__shared__ int totalClipped;
if (tid == 0)
totalClipped = clipped;
__syncthreads();
// redistribute clipped samples evenly
int redistBatch = totalClipped / 256;
tHistVal += redistBatch;
int residual = totalClipped - redistBatch * 256;
if (tid < residual)
++tHistVal;
}
const int lutVal = blockScanInclusive<256>(tHistVal, smem, tid);
lut(ty * tilesX + tx, tid) = saturate_cast<uchar>(__float2int_rn(lutScale * lutVal));
}
void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(tilesX, tilesY);
calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void tranformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= src.cols || y >= src.rows)
return;
const float tyf = (static_cast<float>(y) / tileSize.y) - 0.5f;
int ty1 = __float2int_rd(tyf);
int ty2 = ty1 + 1;
const float ya = tyf - ty1;
ty1 = ::max(ty1, 0);
ty2 = ::min(ty2, tilesY - 1);
const float txf = (static_cast<float>(x) / tileSize.x) - 0.5f;
int tx1 = __float2int_rd(txf);
int tx2 = tx1 + 1;
const float xa = txf - tx1;
tx1 = ::max(tx1, 0);
tx2 = ::min(tx2, tilesX - 1);
const int srcVal = src(y, x);
float res = 0;
res += lut(ty1 * tilesX + tx1, srcVal) * ((1.0f - xa) * (1.0f - ya));
res += lut(ty1 * tilesX + tx2, srcVal) * ((xa) * (1.0f - ya));
res += lut(ty2 * tilesX + tx1, srcVal) * ((1.0f - xa) * (ya));
res += lut(ty2 * tilesX + tx2, srcVal) * ((xa) * (ya));
dst(y, x) = saturate_cast<uchar>(res);
}
void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(tranformKernel, cudaFuncCachePreferL1) );
tranformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#endif // CUDA_DISABLER
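
calcLutKernel builds each tile's 256-bin histogram with shared-memory atomics, clips every bin at clipLimit, redistributes the clipped total T evenly (floor(T/256) per bin, plus one for the first T mod 256 bins) and stores the scaled inclusive prefix sum as the tile LUT:

\mathrm{lut}_t[v] = \operatorname{sat}_{\mathrm{uchar}}\!\left( \mathrm{lutScale} \cdot \sum_{u \le v} \tilde h_t[u] \right)

tranformKernel then looks up the source intensity in the LUTs of the four surrounding tiles and blends them bilinearly with the weights (1-x_a)(1-y_a), x_a(1-y_a), (1-x_a)y_a and x_a*y_a.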


@@ -1,461 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/color.hpp"
#include "cvt_color_internal.h"
namespace cv { namespace gpu { namespace cudev
{
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_x = 8 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) \
{ \
traits::functor_type functor = traits::create_functor(); \
typedef typename traits::functor_type::argument_type src_t; \
typedef typename traits::functor_type::result_type dst_t; \
cv::gpu::cudev::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
}
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
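Each OPENCV_GPU_IMPLEMENT_CVTCOLOR_* invocation below stamps out one host-callable conversion function. For example, OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555) expands mechanically to:

void bgr_to_bgr555(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
    bgr_to_bgr555_traits::functor_type functor = bgr_to_bgr555_traits::create_functor();
    typedef typename bgr_to_bgr555_traits::functor_type::argument_type src_t;
    typedef typename bgr_to_bgr555_traits::functor_type::result_type dst_t;
    cv::gpu::cudev::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream);
}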
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgra)
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
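For orientation, every entry point generated above is a plain function in cv::gpu::cudev with the uniform signature void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream), so the host wrapper can dispatch on image depth through a small function table. The sketch below illustrates such a dispatch; the table layout, the bgrToRgb helper name and the implicit GpuMat-to-PtrStepSzb conversions are assumptions for illustration, not code from this commit.

    typedef void (*cvt_func_t)(cv::gpu::PtrStepSzb src, cv::gpu::PtrStepSzb dst, cudaStream_t stream);

    void bgrToRgb(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cudaStream_t stream)
    {
        // one slot per CV depth (8U, 8S, 16U, 16S, 32S, 32F); unsupported depths stay 0
        static const cvt_func_t funcs[] =
        {
            cv::gpu::cudev::bgr_to_rgb_8u, 0, cv::gpu::cudev::bgr_to_rgb_16u, 0, 0, cv::gpu::cudev::bgr_to_rgb_32f
        };

        dst.create(src.size(), src.type());
        funcs[src.depth()](src, dst, stream);
    }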

View File

@@ -1,544 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/color.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct Bayer2BGR;
template <> struct Bayer2BGR<uchar>
{
uchar3 res0;
uchar3 res1;
uchar3 res2;
uchar3 res3;
__device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
{
uchar4 patch[3][3];
patch[0][1] = ((const uchar4*) src.ptr(s_y - 1))[s_x];
patch[0][0] = ((const uchar4*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
patch[1][1] = ((const uchar4*) src.ptr(s_y))[s_x];
patch[1][0] = ((const uchar4*) src.ptr(s_y))[::max(s_x - 1, 0)];
patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
patch[2][1] = ((const uchar4*) src.ptr(s_y + 1))[s_x];
patch[2][0] = ((const uchar4*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
if ((s_y & 1) ^ start_with_green)
{
const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
const int t1 = (patch[1][0].w + patch[1][1].y + 1) >> 1;
const int t2 = (patch[0][1].x + patch[0][1].z + patch[2][1].x + patch[2][1].z + 2) >> 2;
const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][1].z + patch[2][1].y + 2) >> 2;
const int t4 = (patch[0][1].z + patch[2][1].z + 1) >> 1;
const int t5 = (patch[1][1].y + patch[1][1].w + 1) >> 1;
const int t6 = (patch[0][1].z + patch[0][2].x + patch[2][1].z + patch[2][2].x + 2) >> 2;
const int t7 = (patch[0][1].w + patch[1][1].z + patch[1][2].x + patch[2][1].w + 2) >> 2;
if ((s_y & 1) ^ blue_last)
{
res0.x = t1;
res0.y = patch[1][1].x;
res0.z = t0;
res1.x = patch[1][1].y;
res1.y = t3;
res1.z = t2;
res2.x = t5;
res2.y = patch[1][1].z;
res2.z = t4;
res3.x = patch[1][1].w;
res3.y = t7;
res3.z = t6;
}
else
{
res0.x = t0;
res0.y = patch[1][1].x;
res0.z = t1;
res1.x = t2;
res1.y = t3;
res1.z = patch[1][1].y;
res2.x = t4;
res2.y = patch[1][1].z;
res2.z = t5;
res3.x = t6;
res3.y = t7;
res3.z = patch[1][1].w;
}
}
else
{
const int t0 = (patch[0][0].w + patch[0][1].y + patch[2][0].w + patch[2][1].y + 2) >> 2;
const int t1 = (patch[0][1].x + patch[1][0].w + patch[1][1].y + patch[2][1].x + 2) >> 2;
const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
const int t3 = (patch[1][1].x + patch[1][1].z + 1) >> 1;
const int t4 = (patch[0][1].y + patch[0][1].w + patch[2][1].y + patch[2][1].w + 2) >> 2;
const int t5 = (patch[0][1].z + patch[1][1].y + patch[1][1].w + patch[2][1].z + 2) >> 2;
const int t6 = (patch[0][1].w + patch[2][1].w + 1) >> 1;
const int t7 = (patch[1][1].z + patch[1][2].x + 1) >> 1;
if ((s_y & 1) ^ blue_last)
{
res0.x = patch[1][1].x;
res0.y = t1;
res0.z = t0;
res1.x = t3;
res1.y = patch[1][1].y;
res1.z = t2;
res2.x = patch[1][1].z;
res2.y = t5;
res2.z = t4;
res3.x = t7;
res3.y = patch[1][1].w;
res3.z = t6;
}
else
{
res0.x = t0;
res0.y = t1;
res0.z = patch[1][1].x;
res1.x = t2;
res1.y = patch[1][1].y;
res1.z = t3;
res2.x = t4;
res2.y = t5;
res2.z = patch[1][1].z;
res3.x = t6;
res3.y = patch[1][1].w;
res3.z = t7;
}
}
}
};
template <typename D> __device__ __forceinline__ D toDst(const uchar3& pix);
template <> __device__ __forceinline__ uchar toDst<uchar>(const uchar3& pix)
{
typename bgr_to_gray_traits<uchar>::functor_type f = bgr_to_gray_traits<uchar>::create_functor();
return f(pix);
}
template <> __device__ __forceinline__ uchar3 toDst<uchar3>(const uchar3& pix)
{
return pix;
}
template <> __device__ __forceinline__ uchar4 toDst<uchar4>(const uchar3& pix)
{
return make_uchar4(pix.x, pix.y, pix.z, 255);
}
template <typename D>
__global__ void Bayer2BGR_8u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= src.rows || (s_x << 2) >= src.cols)
return;
s_y = ::min(::max(s_y, 1), src.rows - 2);
Bayer2BGR<uchar> bayer;
bayer.apply(src, s_x, s_y, blue_last, start_with_green);
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = toDst<D>(bayer.res0);
if (d_x + 1 < src.cols)
dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
if (d_x + 2 < src.cols)
dst(d_y, d_x + 2) = toDst<D>(bayer.res2);
if (d_x + 3 < src.cols)
dst(d_y, d_x + 3) = toDst<D>(bayer.res3);
}
template <> struct Bayer2BGR<ushort>
{
ushort3 res0;
ushort3 res1;
__device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
{
ushort2 patch[3][3];
patch[0][1] = ((const ushort2*) src.ptr(s_y - 1))[s_x];
patch[0][0] = ((const ushort2*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
patch[1][1] = ((const ushort2*) src.ptr(s_y))[s_x];
patch[1][0] = ((const ushort2*) src.ptr(s_y))[::max(s_x - 1, 0)];
patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
patch[2][1] = ((const ushort2*) src.ptr(s_y + 1))[s_x];
patch[2][0] = ((const ushort2*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
if ((s_y & 1) ^ start_with_green)
{
const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
const int t1 = (patch[1][0].y + patch[1][1].y + 1) >> 1;
const int t2 = (patch[0][1].x + patch[0][2].x + patch[2][1].x + patch[2][2].x + 2) >> 2;
const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][2].x + patch[2][1].y + 2) >> 2;
if ((s_y & 1) ^ blue_last)
{
res0.x = t1;
res0.y = patch[1][1].x;
res0.z = t0;
res1.x = patch[1][1].y;
res1.y = t3;
res1.z = t2;
}
else
{
res0.x = t0;
res0.y = patch[1][1].x;
res0.z = t1;
res1.x = t2;
res1.y = t3;
res1.z = patch[1][1].y;
}
}
else
{
const int t0 = (patch[0][0].y + patch[0][1].y + patch[2][0].y + patch[2][1].y + 2) >> 2;
const int t1 = (patch[0][1].x + patch[1][0].y + patch[1][1].y + patch[2][1].x + 2) >> 2;
const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
const int t3 = (patch[1][1].x + patch[1][2].x + 1) >> 1;
if ((s_y & 1) ^ blue_last)
{
res0.x = patch[1][1].x;
res0.y = t1;
res0.z = t0;
res1.x = t3;
res1.y = patch[1][1].y;
res1.z = t2;
}
else
{
res0.x = t0;
res0.y = t1;
res0.z = patch[1][1].x;
res1.x = t2;
res1.y = patch[1][1].y;
res1.z = t3;
}
}
}
};
template <typename D> __device__ __forceinline__ D toDst(const ushort3& pix);
template <> __device__ __forceinline__ ushort toDst<ushort>(const ushort3& pix)
{
typename bgr_to_gray_traits<ushort>::functor_type f = bgr_to_gray_traits<ushort>::create_functor();
return f(pix);
}
template <> __device__ __forceinline__ ushort3 toDst<ushort3>(const ushort3& pix)
{
return pix;
}
template <> __device__ __forceinline__ ushort4 toDst<ushort4>(const ushort3& pix)
{
return make_ushort4(pix.x, pix.y, pix.z, numeric_limits<ushort>::max());
}
template <typename D>
__global__ void Bayer2BGR_16u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= src.rows || (s_x << 1) >= src.cols)
return;
s_y = ::min(::max(s_y, 1), src.rows - 2);
Bayer2BGR<ushort> bayer;
bayer.apply(src, s_x, s_y, blue_last, start_with_green);
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = toDst<D>(bayer.res0);
if (d_x + 1 < src.cols)
dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
}
template <int cn>
void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, 4 * block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int cn>
void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<ushort, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, 2 * block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void Bayer2BGR_8u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
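// Illustrative host-side launch of the 3-channel, 8-bit path instantiated above. The real
// caller derives blue_last / start_with_green from the requested Bayer code; the flag values
// and the implicit GpuMat-to-PtrStepSzb conversions here are placeholders.
//
//     cv::gpu::GpuMat src;                               // CV_8UC1 Bayer mosaic
//     cv::gpu::GpuMat dst(src.size(), CV_8UC3);
//     Bayer2BGR_8u_gpu<3>(src, dst, /*blue_last*/ true, /*start_with_green*/ false, 0);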
//////////////////////////////////////////////////////////////
// Bayer Demosaicing (Malvar, He, and Cutler)
//
// by Morgan McGuire, Williams College
// http://graphics.cs.williams.edu/papers/BayerJGT09/#shaders
//
// ported to CUDA
texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);
template <typename DstType>
__global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
{
const float kAx = -1.0f / 8.0f, kAy = -1.5f / 8.0f, kAz = 0.5f / 8.0f /*kAw = -1.0f / 8.0f*/;
const float kBx = 2.0f / 8.0f, /*kBy = 0.0f / 8.0f,*/ /*kBz = 0.0f / 8.0f,*/ kBw = 4.0f / 8.0f ;
const float kCx = 4.0f / 8.0f, kCy = 6.0f / 8.0f, kCz = 5.0f / 8.0f /*kCw = 5.0f / 8.0f*/;
const float /*kDx = 0.0f / 8.0f,*/ kDy = 2.0f / 8.0f, kDz = -1.0f / 8.0f /*kDw = -1.0f / 8.0f*/;
const float kEx = -1.0f / 8.0f, kEy = -1.5f / 8.0f, /*kEz = -1.0f / 8.0f,*/ kEw = 0.5f / 8.0f ;
const float kFx = 2.0f / 8.0f, /*kFy = 0.0f / 8.0f,*/ kFz = 4.0f / 8.0f /*kFw = 0.0f / 8.0f*/;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x == 0 || x >= dst.cols - 1 || y == 0 || y >= dst.rows - 1)
return;
int2 center;
center.x = x + sourceOffset.x;
center.y = y + sourceOffset.y;
int4 xCoord;
xCoord.x = center.x - 2;
xCoord.y = center.x - 1;
xCoord.z = center.x + 1;
xCoord.w = center.x + 2;
int4 yCoord;
yCoord.x = center.y - 2;
yCoord.y = center.y - 1;
yCoord.z = center.y + 1;
yCoord.w = center.y + 2;
float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
float4 Dvec;
Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
float4 value;
value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
// (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
float4 PATTERN;
PATTERN.x = kCx * C;
PATTERN.y = kCy * C;
PATTERN.z = kCz * C;
PATTERN.w = PATTERN.z;
float D = Dvec.x + Dvec.y + Dvec.z + Dvec.w;
// There are five filter patterns (identity, cross, checker,
// theta, phi). Precompute the terms from all of them and then
// use swizzles to assign to color channels.
//
// Channel Matches
// x cross (e.g., EE G)
// y checker (e.g., EE B)
// z theta (e.g., EO R)
// w phi (e.g., EO B)
#define A value.x // A0 + A1
#define B value.y // B0 + B1
#define E value.z // E0 + E1
#define F value.w // F0 + F1
float3 temp;
// PATTERN.yzw += (kD.yz * D).xyy;
temp.x = kDy * D;
temp.y = kDz * D;
PATTERN.y += temp.x;
PATTERN.z += temp.y;
PATTERN.w += temp.y;
// PATTERN += (kA.xyz * A).xyzx;
temp.x = kAx * A;
temp.y = kAy * A;
temp.z = kAz * A;
PATTERN.x += temp.x;
PATTERN.y += temp.y;
PATTERN.z += temp.z;
PATTERN.w += temp.x;
// PATTERN += (kE.xyw * E).xyxz;
temp.x = kEx * E;
temp.y = kEy * E;
temp.z = kEw * E;
PATTERN.x += temp.x;
PATTERN.y += temp.y;
PATTERN.z += temp.x;
PATTERN.w += temp.z;
// PATTERN.xw += kB.xw * B;
PATTERN.x += kBx * B;
PATTERN.w += kBw * B;
// PATTERN.xz += kF.xz * F;
PATTERN.x += kFx * F;
PATTERN.z += kFz * F;
// Determine which of four types of pixels we are on.
int2 alternate;
alternate.x = (x + firstRed.x) % 2;
alternate.y = (y + firstRed.y) % 2;
// in BGR sequence;
uchar3 pixelColor =
(alternate.y == 0) ?
((alternate.x == 0) ?
make_uchar3(saturate_cast<uchar>(PATTERN.y), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(C)) :
make_uchar3(saturate_cast<uchar>(PATTERN.w), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.z))) :
((alternate.x == 0) ?
make_uchar3(saturate_cast<uchar>(PATTERN.z), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.w)) :
make_uchar3(saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(PATTERN.y)));
dst(y, x) = toDst<DstType>(pixelColor);
}
template <int cn>
void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream)
{
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
bindTexture(&sourceTex, src);
MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void MHCdemosaic<1>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
template void MHCdemosaic<3>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
template void MHCdemosaic<4>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
}}}
#endif /* CUDA_DISABLER */
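The Malvar-He-Cutler wrapper above differs from the simpler Bayer2BGR path only in its two extra parameters: sourceOffset shifts the texture reads when src is a view into a larger allocation, and firstRed gives the (x, y) position of the first red sample of the Bayer pattern. A minimal host-side sketch follows; the concrete offset and pattern values, and the GpuMat-to-PtrStepSzb conversions, are illustrative assumptions.

    cv::gpu::GpuMat src;                            // CV_8UC1 Bayer mosaic
    cv::gpu::GpuMat dst(src.size(), CV_8UC3);

    const int2 sourceOffset = make_int2(0, 0);      // src is not a sub-view here
    const int2 firstRed     = make_int2(0, 0);      // placeholder for a BayerRG-like layout

    cv::gpu::cudev::MHCdemosaic<3>(src, sourceOffset, dst, firstRed, 0);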

View File

@@ -1,143 +0,0 @@
#if !defined CUDA_DISABLER
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace gfft
{
texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__device__ int g_counter = 0;
template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j))
{
float val = tex2D(eigTex, j, i);
if (val > threshold)
{
float maxVal = val;
maxVal = ::fmax(tex2D(eigTex, j - 1, i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j , i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j - 1, i), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i), maxVal);
maxVal = ::fmax(tex2D(eigTex, j - 1, i + 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j , i + 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i + 1), maxVal);
if (val == maxVal)
{
const int ind = ::atomicAdd(&g_counter, 1);
if (ind < max_count)
corners[ind] = make_float2(j, i);
}
}
}
}
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
bindTexture(&eigTex, eig);
dim3 block(16, 16);
dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
if (mask.data)
findCorners<<<grid, block>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
else
findCorners<<<grid, block>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
return std::min(count, max_count);
}
class EigGreater
{
public:
__device__ __forceinline__ bool operator()(float2 a, float2 b) const
{
return tex2D(eigTex, a.x, a.y) > tex2D(eigTex, b.x, b.y);
}
};
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count)
{
bindTexture(&eigTex, eig);
thrust::device_ptr<float2> ptr(corners);
thrust::sort(ptr, ptr + count, EigGreater());
}
} // namespace gfft
}}}
#endif /* CUDA_DISABLER */
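Together these two entry points form the device side of goodFeaturesToTrack: findCorners_gpu gathers local maxima of the corner-response image that exceed a threshold, and sortCorners_gpu orders them by response strength. The sketch below shows how a host caller might chain them; the buffer handling and every name other than the two entry points are illustrative.

    const int   maxCorners = 1000;     // illustrative cap on the number of corners
    const float threshold  = 10.0f;    // illustrative absolute response threshold

    cv::gpu::GpuMat eig;               // CV_32FC1 corner-response map, computed beforehand
    cv::gpu::GpuMat corners(1, maxCorners, CV_32FC2);

    int total = cv::gpu::cudev::gfft::findCorners_gpu(eig, threshold, cv::gpu::GpuMat(), corners.ptr<float2>(), maxCorners);
    if (total > 0)
        cv::gpu::cudev::gfft::sortCorners_gpu(eig, corners.ptr<float2>(), total);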

View File

@@ -1,153 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace hist
{
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
{
__shared__ int shist[256];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
unsigned int data = rowPtr[x];
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
unsigned int data = ((const uchar*)rowPtr)[x];
Emulation::smem::atomicAdd(&shist[data], 1);
}
}
}
__syncthreads();
const int histVal = shist[tid];
if (histVal > 0)
::atomicAdd(hist + tid, histVal);
}
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
/////////////////////////////////////////////////////////////////////////
namespace hist
{
__constant__ int c_lut[256];
struct EqualizeHist : unary_function<uchar, uchar>
{
float scale;
__host__ EqualizeHist(float _scale) : scale(_scale) {}
__device__ __forceinline__ uchar operator ()(uchar val) const
{
const int lut = c_lut[val];
return __float2int_rn(scale * lut);
}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist>
{
enum { smart_shift = 4 };
};
}}}
namespace hist
{
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );
const float scale = 255.0f / (src.cols * src.rows);
cudev::transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
}
}
#endif /* CUDA_DISABLER */
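histogram256 accumulates into whatever 256-bin device buffer it is handed (the kernel only performs atomic adds), so the caller must zero the buffer first. A minimal host-side sketch; the raw cudaMalloc/cudaMemcpy handling is illustrative, and a production caller would more likely keep the buffer in a GpuMat.

    cv::gpu::GpuMat src;                               // CV_8UC1 input image

    int* d_hist = 0;
    cudaMalloc(&d_hist, 256 * sizeof(int));
    cudaMemset(d_hist, 0, 256 * sizeof(int));          // the kernel only accumulates

    hist::histogram256(src, d_hist, 0);

    int h_hist[256];
    cudaMemcpy(h_hist, d_hist, 256 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_hist);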

File diff suppressed because it is too large

View File

@@ -1,754 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
/////////////////////////////////// MeanShiftFiltering ///////////////////////////////////////////////
texture<uchar4, 2> tex_meanshift;
__device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
size_t out_step, int cols, int rows,
int sp, int sr, int maxIter, float eps)
{
int isr2 = sr*sr;
uchar4 c = tex2D(tex_meanshift, x0, y0 );
// iterate meanshift procedure
for( int iter = 0; iter < maxIter; iter++ )
{
int count = 0;
int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;
float icount;
//mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)
int minx = x0-sp;
int miny = y0-sp;
int maxx = x0+sp;
int maxy = y0+sp;
for( int y = miny; y <= maxy; y++)
{
int rowCount = 0;
for( int x = minx; x <= maxx; x++ )
{
uchar4 t = tex2D( tex_meanshift, x, y );
int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);
if( norm2 <= isr2 )
{
s0 += t.x; s1 += t.y; s2 += t.z;
sx += x; rowCount++;
}
}
count += rowCount;
sy += y*rowCount;
}
if( count == 0 )
break;
icount = 1.f/count;
int x1 = __float2int_rz(sx*icount);
int y1 = __float2int_rz(sy*icount);
s0 = __float2int_rz(s0*icount);
s1 = __float2int_rz(s1*icount);
s2 = __float2int_rz(s2*icount);
int norm2 = (s0 - c.x) * (s0 - c.x) + (s1 - c.y) * (s1 - c.y) + (s2 - c.z) * (s2 - c.z);
bool stopFlag = (x0 == x1 && y0 == y1) || (::abs(x1-x0) + ::abs(y1-y0) + norm2 <= eps);
x0 = x1; y0 = y1;
c.x = s0; c.y = s1; c.z = s2;
if( stopFlag )
break;
}
int base = (blockIdx.y * blockDim.y + threadIdx.y) * out_step + (blockIdx.x * blockDim.x + threadIdx.x) * 4 * sizeof(uchar);
*(uchar4*)(out + base) = c;
return make_short2((short)x0, (short)y0);
}
__global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
{
int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y;
if( x0 < cols && y0 < rows )
do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
}
__global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
unsigned char* outsp, size_t outspstep,
int cols, int rows,
int sp, int sr, int maxIter, float eps)
{
int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y;
if( x0 < cols && y0 < rows )
{
int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
*(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
}
}
void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
{
dim3 grid(1, 1, 1);
dim3 threads(32, 8, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
//cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
void meanShiftProc_gpu(const PtrStepSzb& src, PtrStepSzb dstr, PtrStepSzb dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
{
dim3 grid(1, 1, 1);
dim3 threads(32, 8, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
//cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
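// Note on the two wrappers above: meanShiftFiltering_gpu writes only the filtered color
// image, while meanShiftProc_gpu additionally records, for every pixel, the (x, y) position
// at which the mean-shift iteration converged (a short2 in dstsp); that auxiliary map is
// intended for the segmentation step performed later on the host.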
/////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
template <typename T>
__device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
{
unsigned int H = ((ndisp-d) * 240)/ndisp;
unsigned int hi = (H/60) % 6;
float f = H/60.f - H/60;
float p = V * (1 - S);
float q = V * (1 - f * S);
float t = V * (1 - (1 - f) * S);
float3 res;
if (hi == 0) //R = V, G = t, B = p
{
res.x = p;
res.y = t;
res.z = V;
}
if (hi == 1) // R = q, G = V, B = p
{
res.x = p;
res.y = V;
res.z = q;
}
if (hi == 2) // R = p, G = V, B = t
{
res.x = t;
res.y = V;
res.z = p;
}
if (hi == 3) // R = p, G = q, B = V
{
res.x = V;
res.y = q;
res.z = p;
}
if (hi == 4) // R = t, G = p, B = V
{
res.x = V;
res.y = p;
res.z = t;
}
if (hi == 5) // R = V, G = p, B = q
{
res.x = q;
res.y = p;
res.z = V;
}
const unsigned int b = (unsigned int)(::max(0.f, ::min(res.x, 1.f)) * 255.f);
const unsigned int g = (unsigned int)(::max(0.f, ::min(res.y, 1.f)) * 255.f);
const unsigned int r = (unsigned int)(::max(0.f, ::min(res.z, 1.f)) * 255.f);
const unsigned int a = 255U;
return (a << 24) + (r << 16) + (g << 8) + b;
}
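// cvtPixel maps a disparity d to the hue (ndisp - d) * 240 / ndisp degrees (red for the
// largest disparities down to blue for the smallest), runs the standard hue-sector
// HSV-to-RGB conversion with S = V = 1, and packs the result into one 32-bit value
// (BGRA byte order in memory, with full alpha).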
__global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
{
const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x < width && y < height)
{
uchar4 d4 = *(uchar4*)(disp + y * disp_step + x);
uint4 res;
res.x = cvtPixel(d4.x, ndisp);
res.y = cvtPixel(d4.y, ndisp);
res.z = cvtPixel(d4.z, ndisp);
res.w = cvtPixel(d4.w, ndisp);
uint4* line = (uint4*)(out_image + y * out_step);
line[x >> 2] = res;
}
}
__global__ void drawColorDisp(short* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
{
const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x < width && y < height)
{
short2 d2 = *(short2*)(disp + y * disp_step + x);
uint2 res;
res.x = cvtPixel(d2.x, ndisp);
res.y = cvtPixel(d2.y, ndisp);
uint2* line = (uint2*)(out_image + y * out_step);
line[x >> 1] = res;
}
}
void drawColorDisp_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
{
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x << 2);
grid.y = divUp(src.rows, threads.y);
drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void drawColorDisp_gpu(const PtrStepSz<short>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x << 1);
grid.y = divUp(src.rows, threads.y);
drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
/////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
__constant__ float cq[16];
template <typename T, typename D>
__global__ void reprojectImageTo3D(const PtrStepSz<T> disp, PtrStep<D> xyz)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y >= disp.rows || x >= disp.cols)
return;
const float qx = x * cq[ 0] + y * cq[ 1] + cq[ 3];
const float qy = x * cq[ 4] + y * cq[ 5] + cq[ 7];
const float qz = x * cq[ 8] + y * cq[ 9] + cq[11];
const float qw = x * cq[12] + y * cq[13] + cq[15];
const T d = disp(y, x);
const float iW = 1.f / (qw + cq[14] * d);
D v = VecTraits<D>::all(1.0f);
v.x = (qx + cq[2] * d) * iW;
v.y = (qy + cq[6] * d) * iW;
v.z = (qz + cq[10] * d) * iW;
xyz(y, x) = v;
}
template <typename T, typename D>
void reprojectImageTo3D_gpu(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(disp.cols, block.x), divUp(disp.rows, block.y));
cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );
reprojectImageTo3D<T, D><<<grid, block, 0, stream>>>((PtrStepSz<T>)disp, (PtrStepSz<D>)xyz);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void reprojectImageTo3D_gpu<uchar, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
template void reprojectImageTo3D_gpu<uchar, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
template void reprojectImageTo3D_gpu<short, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
template void reprojectImageTo3D_gpu<short, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
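// For reference, the kernel above evaluates the usual disparity-to-depth reprojection
//
//     [X Y Z W]^T = Q * [x y d 1]^T,    xyz(y, x) = (X / W, Y / W, Z / W),
//
// with the 4x4 matrix Q stored row-major in the constant array cq: qx..qw hold the
// disparity-independent part of each row, and cq[2], cq[6], cq[10], cq[14] contribute
// the disparity-dependent terms.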
/////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float a = 0.f;
float b = 0.f;
float c = 0.f;
const int ibegin = y - (block_size / 2);
const int jbegin = x - (block_size / 2);
const int iend = ibegin + block_size;
const int jend = jbegin + block_size;
for (int i = ibegin; i < iend; ++i)
{
for (int j = jbegin; j < jend; ++j)
{
float dx = tex2D(harrisDxTex, j, i);
float dy = tex2D(harrisDyTex, j, i);
a += dx * dx;
b += dx * dy;
c += dy * dy;
}
}
dst(y, x) = a * c - b * b - k * (a + c) * (a + c);
}
}
template <typename BR, typename BC>
__global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float a = 0.f;
float b = 0.f;
float c = 0.f;
const int ibegin = y - (block_size / 2);
const int jbegin = x - (block_size / 2);
const int iend = ibegin + block_size;
const int jend = jbegin + block_size;
for (int i = ibegin; i < iend; ++i)
{
const int y = border_col.idx_row(i);
for (int j = jbegin; j < jend; ++j)
{
const int x = border_row.idx_col(j);
float dx = tex2D(harrisDxTex, x, y);
float dy = tex2D(harrisDyTex, x, y);
a += dx * dx;
b += dx * dy;
c += dy * dy;
}
}
dst(y, x) = a * c - b * b - k * (a + c) * (a + c);
}
}
void cornerHarris_gpu(int block_size, float k, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
bindTexture(&harrisDxTex, Dx);
bindTexture(&harrisDyTex, Dy);
switch (border_type)
{
case BORDER_REFLECT101_GPU:
cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
break;
case BORDER_REFLECT_GPU:
cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
break;
case BORDER_REPLICATE_GPU:
cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
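// The value written by the kernels above is the standard Harris response
//
//     R = det(M) - k * trace(M)^2 = (a*c - b*b) - k * (a + c)^2,
//
// where M = [[a, b], [b, c]] accumulates dx*dx, dx*dy and dy*dy over the block_size
// window read back from the Sobel-derivative textures.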
/////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float a = 0.f;
float b = 0.f;
float c = 0.f;
const int ibegin = y - (block_size / 2);
const int jbegin = x - (block_size / 2);
const int iend = ibegin + block_size;
const int jend = jbegin + block_size;
for (int i = ibegin; i < iend; ++i)
{
for (int j = jbegin; j < jend; ++j)
{
float dx = tex2D(minEigenValDxTex, j, i);
float dy = tex2D(minEigenValDyTex, j, i);
a += dx * dx;
b += dx * dy;
c += dy * dy;
}
}
a *= 0.5f;
c *= 0.5f;
dst(y, x) = (a + c) - sqrtf((a - c) * (a - c) + b * b);
}
}
template <typename BR, typename BC>
__global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float a = 0.f;
float b = 0.f;
float c = 0.f;
const int ibegin = y - (block_size / 2);
const int jbegin = x - (block_size / 2);
const int iend = ibegin + block_size;
const int jend = jbegin + block_size;
for (int i = ibegin; i < iend; ++i)
{
int y = border_col.idx_row(i);
for (int j = jbegin; j < jend; ++j)
{
int x = border_row.idx_col(j);
float dx = tex2D(minEigenValDxTex, x, y);
float dy = tex2D(minEigenValDyTex, x, y);
a += dx * dx;
b += dx * dy;
c += dy * dy;
}
}
a *= 0.5f;
c *= 0.5f;
dst(y, x) = (a + c) - sqrtf((a - c) * (a - c) + b * b);
}
}
void cornerMinEigenVal_gpu(int block_size, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
bindTexture(&minEigenValDxTex, Dx);
bindTexture(&minEigenValDyTex, Dy);
switch (border_type)
{
case BORDER_REFLECT101_GPU:
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
break;
case BORDER_REFLECT_GPU:
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
break;
case BORDER_REPLICATE_GPU:
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
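// cornerMinEigenVal writes the smaller eigenvalue of the same 2x2 structure matrix:
//
//     lambda_min = (a + c) / 2 - sqrt(((a - c) / 2)^2 + b^2),
//
// which is exactly what the kernels compute after halving a and c.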
//////////////////////////////////////////////////////////////////////////
// buildWarpMaps
// TODO use intrinsics like __sinf and so on
namespace build_warp_maps
{
__constant__ float ck_rinv[9];
__constant__ float cr_kinv[9];
__constant__ float ct[3];
__constant__ float cscale;
}
class PlaneMapper
{
public:
static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
{
using namespace build_warp_maps;
float x_ = u / cscale - ct[0];
float y_ = v / cscale - ct[1];
float z;
x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);
y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);
z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);
x /= z;
y /= z;
}
};
class CylindricalMapper
{
public:
static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
{
using namespace build_warp_maps;
u /= cscale;
float x_ = ::sinf(u);
float y_ = v / cscale;
float z_ = ::cosf(u);
float z;
x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
if (z > 0) { x /= z; y /= z; }
else x = y = -1;
}
};
class SphericalMapper
{
public:
static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
{
using namespace build_warp_maps;
v /= cscale;
u /= cscale;
float sinv = ::sinf(v);
float x_ = sinv * ::sinf(u);
float y_ = -::cosf(v);
float z_ = sinv * ::cosf(u);
float z;
x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
if (z > 0) { x /= z; y /= z; }
else x = y = -1;
}
};
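// All three mappers share one backward-mapping pattern: take a destination pixel (u, v)
// in the warped (plane / cylinder / sphere) parameterization, undo the scale, lift it to
// a point on the corresponding surface, then project that point back into the source image
// with the precomputed ck_rinv (by its name, K times R inverse), dividing by z. The kernel
// below only fills map_x / map_y with these source coordinates; the actual warp is then
// performed with a separate remap pass.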
template <typename Mapper>
__global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows,
PtrStepf map_x, PtrStepf map_y)
{
int du = blockIdx.x * blockDim.x + threadIdx.x;
int dv = blockIdx.y * blockDim.y + threadIdx.y;
if (du < cols && dv < rows)
{
float u = tl_u + du;
float v = tl_v + dv;
float x, y;
Mapper::mapBackward(u, v, x, y);
map_x.ptr(dv)[du] = x;
map_y.ptr(dv)[du] = y;
}
}
void buildWarpPlaneMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
const float k_rinv[9], const float r_kinv[9], const float t[3],
float scale, cudaStream_t stream)
{
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ct, t, 3*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
int cols = map_x.cols;
int rows = map_x.rows;
dim3 threads(32, 8);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
buildWarpMapsKernel<PlaneMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void buildWarpCylindricalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
const float k_rinv[9], const float r_kinv[9], float scale,
cudaStream_t stream)
{
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
int cols = map_x.cols;
int rows = map_x.rows;
dim3 threads(32, 8);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
buildWarpMapsKernel<CylindricalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void buildWarpSphericalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
const float k_rinv[9], const float r_kinv[9], float scale,
cudaStream_t stream)
{
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
int cols = map_x.cols;
int rows = map_x.rows;
dim3 threads(32, 8);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
buildWarpMapsKernel<SphericalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev {
#endif /* CUDA_DISABLER */

View File

@@ -1,916 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace match_template
{
__device__ __forceinline__ float sum(float v) { return v; }
__device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
__device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
__device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }
__device__ __forceinline__ float first(float v) { return v; }
__device__ __forceinline__ float first(float2 v) { return v.x; }
__device__ __forceinline__ float first(float3 v) { return v.x; }
__device__ __forceinline__ float first(float4 v) { return v.x; }
__device__ __forceinline__ float mul(float a, float b) { return a * b; }
__device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
__device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
__device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
__device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }
__device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
__device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
__device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
__device__ __forceinline__ float sub(float a, float b) { return a - b; }
__device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
__device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
__device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
__device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }
__device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
__device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
__device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
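// The sum/first/mul/sub overloads above let the template-matching kernels below be
// written once and instantiated for 1-4 channel uchar and float pixel types.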
//////////////////////////////////////////////////////////////////////
// Naive_CCORR
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
{
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
Typef res = VecTraits<Typef>::all(0);
for (int i = 0; i < h; ++i)
{
const Type* image_ptr = (const Type*)image.ptr(y + i);
const Type* templ_ptr = (const Type*)templ.ptr(i);
for (int j = 0; j < w; ++j)
res = res + mul(image_ptr[x + j], templ_ptr[j]);
}
result.ptr(y)[x] = sum(res);
}
}
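// Each output element is the plain cross-correlation of the template with the image
// window anchored at (x, y): result(y, x) = sum over the w x h window of
// image(y + i, x + j) * templ(i, j), accumulated across all channels.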
template <typename T, int cn>
void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
};
callers[cn](image, templ, result, stream);
}
void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
};
callers[cn](image, templ, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Naive_SQDIFF
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
{
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
Typef res = VecTraits<Typef>::all(0);
Typef delta;
for (int i = 0; i < h; ++i)
{
const Type* image_ptr = (const Type*)image.ptr(y + i);
const Type* templ_ptr = (const Type*)templ.ptr(i);
for (int j = 0; j < w; ++j)
{
delta = sub(image_ptr[x + j], templ_ptr[j]);
res = res + delta * delta;
}
}
result.ptr(y)[x] = sum(res);
}
}
template <typename T, int cn>
void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
};
callers[cn](image, templ, result, stream);
}
void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
};
callers[cn](image, templ, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_SQDIFF
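// Given a precomputed cross-correlation in `result` and the integral image of squared
// values `image_sqsum`, SQDIFF is recovered from the identity
//   sum (I - T)^2 = sum I^2 - 2 * sum (I * T) + sum T^2,
// so only the windowed sum of I^2 has to be looked up per output pixel.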
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
(image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;
}
}
template <int cn>
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn,
cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
};
callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_SQDIFF_NORMED
// normAcc* are accurate normalization routines that keep the GPU matchTemplate
// results consistent with the CPU implementation
__device__ float normAcc(float num, float denum)
{
if (::fabs(num) < denum)
return num / denum;
if (::fabs(num) < denum * 1.125f)
return num > 0 ? 1 : -1;
return 0;
}
__device__ float normAcc_SQDIFF(float num, float denum)
{
if (::fabs(num) < denum)
return num / denum;
if (::fabs(num) < denum * 1.125f)
return num > 0 ? 1 : -1;
return 1;
}
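// Both helpers clamp near-degenerate ratios: if |num| exceeds the denominator by less
// than 12.5% the result saturates to +/-1; anything beyond that is treated as a
// degenerate window (0 for correlation-style scores, 1 for SQDIFF).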
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
(image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,
sqrtf(image_sqsum_ * templ_sqsum));
}
}
template <int cn>
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
};
callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_CCOFF
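// CCOEFF subtracts the template mean from the correlation: the kernels below compute
// ccorr - mean(T) * sum(I over the window), with templ_sum_scale = sum(T) / (w * h)
// passed per channel.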
__global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_ = (float)(
(image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
(image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;
}
}
void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
const PtrStep<unsigned int> image_sum_r,
const PtrStep<unsigned int> image_sum_g,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g;
}
}
void matchTemplatePrepared_CCOFF_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
unsigned int templ_sum_r, unsigned int templ_sum_g,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
image_sum_r, image_sum_g, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
int w, int h,
float templ_sum_scale_r,
float templ_sum_scale_g,
float templ_sum_scale_b,
const PtrStep<unsigned int> image_sum_r,
const PtrStep<unsigned int> image_sum_g,
const PtrStep<unsigned int> image_sum_b,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b;
}
}
void matchTemplatePrepared_CCOFF_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
w, h,
(float)templ_sum_r / (w * h),
(float)templ_sum_g / (w * h),
(float)templ_sum_b / (w * h),
image_sum_r, image_sum_g, image_sum_b, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
int w, int h,
float templ_sum_scale_r,
float templ_sum_scale_g,
float templ_sum_scale_b,
float templ_sum_scale_a,
const PtrStep<unsigned int> image_sum_r,
const PtrStep<unsigned int> image_sum_g,
const PtrStep<unsigned int> image_sum_b,
const PtrStep<unsigned int> image_sum_a,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float image_sum_a_ = (float)(
(image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
(image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b
- image_sum_a_ * templ_sum_scale_a;
}
}
void matchTemplatePrepared_CCOFF_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
const PtrStepSz<unsigned int> image_sum_a,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
unsigned int templ_sum_a,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
w, h,
(float)templ_sum_r / (w * h),
(float)templ_sum_g / (w * h),
(float)templ_sum_b / (w * h),
(float)templ_sum_a / (w * h),
image_sum_r, image_sum_g, image_sum_b, image_sum_a,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////
// Prepared_CCOFF_NORMED
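// The normalized variant divides the CCOEFF numerator by
//   sqrt( (sum T^2 - (sum T)^2 / N) * (sum I^2 over the window - (sum I)^2 / N) ),  N = w * h,
// which is what the accurate normAcc helper then clamps to keep results in [-1, 1].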
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
int w, int h, float weight,
float templ_sum_scale, float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum,
const PtrStep<unsigned long long> image_sqsum,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float ccorr = result.ptr(y)[x];
float image_sum_ = (float)(
(image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
(image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
(image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,
sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));
}
}
void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const PtrStepSz<unsigned int> image_sum,
const PtrStepSz<unsigned long long> image_sqsum,
unsigned int templ_sum, unsigned long long templ_sqsum,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale = templ_sum * weight;
float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;
matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
w, h, weight, templ_sum_scale, templ_sqsum_scale,
image_sum, image_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g,
float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sqsum_r_ = (float)(
(image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
(image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sqsum_g_ = (float)(
(image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
(image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g;
float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
+ image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
result.ptr(y)[x] = normAcc(num, denum);
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale_r = templ_sum_r * weight;
float templ_sum_scale_g = templ_sum_g * weight;
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g,
templ_sqsum_scale,
image_sum_r, image_sqsum_r,
image_sum_g, image_sqsum_g,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sqsum_r_ = (float)(
(image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
(image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sqsum_g_ = (float)(
(image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
(image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float image_sqsum_b_ = (float)(
(image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
(image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b;
float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
+ image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
+ image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
result.ptr(y)[x] = normAcc(num, denum);
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale_r = templ_sum_r * weight;
float templ_sum_scale_g = templ_sum_g * weight;
float templ_sum_scale_b = templ_sum_b * weight;
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+ templ_sqsum_b - weight * templ_sum_b * templ_sum_b;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
templ_sqsum_scale,
image_sum_r, image_sqsum_r,
image_sum_g, image_sqsum_g,
image_sum_b, image_sqsum_b,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
float templ_sum_scale_a, float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sqsum_r_ = (float)(
(image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
(image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sqsum_g_ = (float)(
(image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
(image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float image_sqsum_b_ = (float)(
(image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
(image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
float image_sum_a_ = (float)(
(image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
(image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
float image_sqsum_a_ = (float)(
(image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -
(image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));
float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;
float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
+ image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
+ image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
+ image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
result.ptr(y)[x] = normAcc(num, denum);
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale_r = templ_sum_r * weight;
float templ_sum_scale_g = templ_sum_g * weight;
float templ_sum_scale_b = templ_sum_b * weight;
float templ_sum_scale_a = templ_sum_a * weight;
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+ templ_sqsum_b - weight * templ_sum_b * templ_sum_b
+ templ_sqsum_a - weight * templ_sum_a * templ_sum_a;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
templ_sqsum_scale,
image_sum_r, image_sqsum_r,
image_sum_g, image_sqsum_g,
image_sum_b, image_sqsum_b,
image_sum_a, image_sqsum_a,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////
// normalize
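// Presumably used by the *_NORMED correlation modes: divides the raw score by
// sqrt(windowed sum of I^2 * sum of T^2) via normAcc.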
template <int cn>
__global__ void normalizeKernel_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
(image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));
}
}
void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 2:
normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 3:
normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 4:
normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////
// extractFirstChannel
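// Copies channel 0 of a 1-4 channel 32F image into a single-channel float result.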
template <int cn>
__global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
{
typedef typename TypeVec<float, cn>::vec_type Typef;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
Typef val = ((const Typef*)image.ptr(y))[x];
result.ptr(y)[x] = first(val);
}
}
void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
break;
case 2:
extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
break;
case 3:
extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
break;
case 4:
extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
} //namespace match_template
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

View File

@@ -1,569 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
using namespace cv::gpu;
typedef unsigned char uchar;
typedef unsigned short ushort;
//////////////////////////////////////////////////////////////////////////////////
//// Non Local Means Denoising
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
__device__ __forceinline__ float norm2(const float& v) { return v*v; }
__device__ __forceinline__ float norm2(const float2& v) { return v.x*v.x + v.y*v.y; }
__device__ __forceinline__ float norm2(const float3& v) { return v.x*v.x + v.y*v.y + v.z*v.z; }
__device__ __forceinline__ float norm2(const float4& v) { return v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w; }
template<typename T, typename B>
__global__ void nlm_kernel(const PtrStep<T> src, PtrStepSz<T> dst, const B b, int search_radius, int block_radius, float noise_mult)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int i = blockDim.y * blockIdx.y + threadIdx.y;
const int j = blockDim.x * blockIdx.x + threadIdx.x;
if (j >= dst.cols || i >= dst.rows)
return;
int bsize = search_radius + block_radius;
int search_window = 2 * search_radius + 1;
float minus_search_window2_inv = -1.f/(search_window * search_window);
value_type sum1 = VecTraits<value_type>::all(0);
float sum2 = 0.f;
if (j - bsize >= 0 && j + bsize < dst.cols && i - bsize >= 0 && i + bsize < dst.rows)
{
for(float y = -search_radius; y <= search_radius; ++y)
for(float x = -search_radius; x <= search_radius; ++x)
{
float dist2 = 0;
for(float ty = -block_radius; ty <= block_radius; ++ty)
for(float tx = -block_radius; tx <= block_radius; ++tx)
{
value_type bv = saturate_cast<value_type>(src(i + y + ty, j + x + tx));
value_type av = saturate_cast<value_type>(src(i + ty, j + tx));
dist2 += norm2(av - bv);
}
float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv);
/*if (i == 255 && j == 255)
printf("%f %f\n", w, dist2 * minus_h2_inv + (x * x + y * y) * minus_search_window2_inv);*/
sum1 = sum1 + w * saturate_cast<value_type>(src(i + y, j + x));
sum2 += w;
}
}
else
{
for(float y = -search_radius; y <= search_radius; ++y)
for(float x = -search_radius; x <= search_radius; ++x)
{
float dist2 = 0;
for(float ty = -block_radius; ty <= block_radius; ++ty)
for(float tx = -block_radius; tx <= block_radius; ++tx)
{
value_type bv = saturate_cast<value_type>(b.at(i + y + ty, j + x + tx, src));
value_type av = saturate_cast<value_type>(b.at(i + ty, j + tx, src));
dist2 += norm2(av - bv);
}
float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv);
sum1 = sum1 + w * saturate_cast<value_type>(b.at(i + y, j + x, src));
sum2 += w;
}
}
dst(i, j) = saturate_cast<T>(sum1 / sum2);
}
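// Brute-force NLM: for every offset (x, y) inside the search window the kernel
// accumulates the squared patch distance, converts it to a weight
//   w = exp(dist2 * noise_mult - (x^2 + y^2) / search_window^2)
// and averages the neighbourhood pixels with those weights. The first branch handles
// pixels whose whole search area lies inside the image; the second goes through the
// border interpolator b.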
template<typename T, template <typename> class B>
void nlm_caller(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream)
{
dim3 block (32, 8);
dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));
B<T> b(src.rows, src.cols);
int block_window = 2 * block_radius + 1;
float minus_h2_inv = -1.f/(h * h * VecTraits<T>::cn);
float noise_mult = minus_h2_inv/(block_window * block_window);
cudaSafeCall( cudaFuncSetCacheConfig (nlm_kernel<T, B<T> >, cudaFuncCachePreferL1) );
nlm_kernel<<<grid, block>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, search_radius, block_radius, noise_mult);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template<typename T>
void nlm_bruteforce_gpu(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream)
{
typedef void (*func_t)(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream);
static func_t funcs[] =
{
nlm_caller<T, BrdReflect101>,
nlm_caller<T, BrdReplicate>,
nlm_caller<T, BrdConstant>,
nlm_caller<T, BrdReflect>,
nlm_caller<T, BrdWrap>,
};
funcs[borderMode](src, dst, search_radius, block_radius, h, stream);
}
template void nlm_bruteforce_gpu<uchar>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
template void nlm_bruteforce_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
template void nlm_bruteforce_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
}
}}}
//////////////////////////////////////////////////////////////////////////////////
//// Non Local Means Denoising (fast approximate version)
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <int cn> struct Unroll;
template <> struct Unroll<1>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2)
{
return thrust::tie(val1, val2);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op);
}
};
template <> struct Unroll<2>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2)
{
return thrust::tie(val1, val2.x, val2.y);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op);
}
};
template <> struct Unroll<3>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op);
}
};
template <> struct Unroll<4>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op, op);
}
};
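// Unroll<cn> packs the scalar weight sum and the cn-channel colour sum into one thrust
// tuple so that a single block-wide reduce<CTA_SIZE> call can combine both at once.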
__device__ __forceinline__ int calcDist(const uchar& a, const uchar& b) { return (a-b)*(a-b); }
__device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
__device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
template <class T> struct FastNonLocalMenas
{
enum
{
CTA_SIZE = 128,
TILE_COLS = 128,
TILE_ROWS = 32,
STRIDE = CTA_SIZE
};
struct plus
{
__device__ __forceinline__ float operator()(float v1, float v2) const { return v1 + v2; }
};
int search_radius;
int block_radius;
int search_window;
int block_window;
float minus_h2_inv;
FastNonLocalMenas(int search_window_, int block_window_, float h) : search_radius(search_window_/2), block_radius(block_window_/2),
search_window(search_window_), block_window(block_window_), minus_h2_inv(-1.f/(h * h * VecTraits<T>::cn)) {}
PtrStep<T> src;
mutable PtrStepi buffer;
__device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
dist_sums[index] = 0;
for(int tx = 0; tx < block_window; ++tx)
col_sums(tx, index) = 0;
int y = index / search_window;
int x = index - y * search_window;
int ay = i;
int ax = j;
int by = i + y - search_radius;
int bx = j + x - search_radius;
#if 1
for (int tx = -block_radius; tx <= block_radius; ++tx)
{
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
{
int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx));
dist_sums[index] += dist;
col_sum += dist;
}
col_sums(tx + block_radius, index) = col_sum;
}
#else
for (int ty = -block_radius; ty <= block_radius; ++ty)
for (int tx = -block_radius; tx <= block_radius; ++tx)
{
int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx));
dist_sums[index] += dist;
col_sums(tx + block_radius, index) += dist;
}
#endif
up_col_sums(j, index) = col_sums(block_window - 1, index);
}
}
__device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
int ay = i;
int ax = j + block_radius;
int by = i + y - search_radius;
int bx = j + x - search_radius + block_radius;
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
col_sum += calcDist(src(ay + ty, ax), src(by + ty, bx));
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
__device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
int ay = i;
int ax = j + block_radius;
T a_up = src(ay - block_radius - 1, ax);
T a_down = src(ay + block_radius, ax);
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
int by = i + y - search_radius;
int bx = j + x - search_radius + block_radius;
T b_up = src(by - block_radius - 1, bx);
T b_down = src(by + block_radius, bx);
int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up);
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
__device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums, T& dst) const
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_type;
float weights_sum = 0;
sum_type sum = VecTraits<sum_type>::all(0);
float bw2_inv = 1.f/(block_window * block_window);
int sx = j - search_radius;
int sy = i - search_radius;
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
float avg_dist = dist_sums[index] * bw2_inv;
float weight = __expf(avg_dist * minus_h2_inv);
weights_sum += weight;
sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
}
__shared__ float cta_buffer[CTA_SIZE * (VecTraits<T>::cn + 1)];
reduce<CTA_SIZE>(Unroll<VecTraits<T>::cn>::template smem_tuple<CTA_SIZE>(cta_buffer),
Unroll<VecTraits<T>::cn>::tie(weights_sum, sum),
threadIdx.x,
Unroll<VecTraits<T>::cn>::op());
if (threadIdx.x == 0)
dst = saturate_cast<T>(sum / weights_sum);
}
__device__ __forceinline__ void operator()(PtrStepSz<T>& dst) const
{
int tbx = blockIdx.x * TILE_COLS;
int tby = blockIdx.y * TILE_ROWS;
int tex = ::min(tbx + TILE_COLS, dst.cols);
int tey = ::min(tby + TILE_ROWS, dst.rows);
PtrStepi col_sums;
col_sums.data = buffer.ptr(dst.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window;
col_sums.step = buffer.step;
PtrStepi up_col_sums;
up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window;
up_col_sums.step = buffer.step;
extern __shared__ int dist_sums[]; //search_window * search_window
int first = 0;
for (int i = tby; i < tey; ++i)
for (int j = tbx; j < tex; ++j)
{
__syncthreads();
if (j == tbx)
{
initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums);
first = 0;
}
else
{
if (i == tby)
shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums);
else
shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums);
first = (first + 1) % block_window;
}
__syncthreads();
convolve_window(i, j, dist_sums, col_sums, up_col_sums, dst(i, j));
}
}
};
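// Sliding-window scheme: initSums_BruteForce fills the patch-distance sums for the first
// column of a tile, shiftRight_FirstRow / shiftRight_UpSums then update them incrementally
// (per-column sums in col_sums, previous-row column sums in up_col_sums) as the window
// moves right, and convolve_window turns the distance sums into exponential weights and
// reduces them across the CTA.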
template<typename T>
__global__ void fast_nlm_kernel(const FastNonLocalMenas<T> fnlm, PtrStepSz<T> dst) { fnlm(dst); }
void nln_fast_get_buffer_size(const PtrStepSzb& src, int search_window, int block_window, int& buffer_cols, int& buffer_rows)
{
typedef FastNonLocalMenas<uchar> FNLM;
dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));
buffer_cols = search_window * search_window * grid.y;
buffer_rows = src.cols + block_window * grid.x;
}
template<typename T>
void nlm_fast_gpu(const PtrStepSzb& src, PtrStepSzb dst, PtrStepi buffer,
int search_window, int block_window, float h, cudaStream_t stream)
{
typedef FastNonLocalMenas<T> FNLM;
FNLM fnlm(search_window, block_window, h);
fnlm.src = (PtrStepSz<T>)src;
fnlm.buffer = buffer;
dim3 block(FNLM::CTA_SIZE, 1);
dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));
int smem = search_window * search_window * sizeof(int);
fast_nlm_kernel<<<grid, block, smem>>>(fnlm, (PtrStepSz<T>)dst);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void nlm_fast_gpu<uchar>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
template void nlm_fast_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
template void nlm_fast_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
__global__ void fnlm_split_kernel(const PtrStepSz<uchar3> lab, PtrStepb l, PtrStep<uchar2> ab)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x < lab.cols && y < lab.rows)
{
uchar3 p = lab(y, x);
ab(y,x) = make_uchar2(p.y, p.z);
l(y,x) = p.x;
}
}
void fnlm_split_channels(const PtrStepSz<uchar3>& lab, PtrStepb l, PtrStep<uchar2> ab, cudaStream_t stream)
{
dim3 b(32, 8);
dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y));
fnlm_split_kernel<<<g, b>>>(lab, l, ab);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void fnlm_merge_kernel(const PtrStepb l, const PtrStep<uchar2> ab, PtrStepSz<uchar3> lab)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x < lab.cols && y < lab.rows)
{
uchar2 p = ab(y, x);
lab(y, x) = make_uchar3(l(y, x), p.x, p.y);
}
}
void fnlm_merge_channels(const PtrStepb& l, const PtrStep<uchar2>& ab, PtrStepSz<uchar3> lab, cudaStream_t stream)
{
dim3 b(32, 8);
dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y));
fnlm_merge_kernel<<<g, b>>>(l, ab, lab);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -1,228 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename T, typename B> __global__ void pyrDown(const PtrStepSz<T> src, PtrStep<T> dst, const B b, int dst_cols)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_t;
__shared__ work_t smem[256 + 4];
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y;
const int src_y = 2 * y;
if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2)
{
{
work_t sum;
sum = 0.0625f * src(src_y - 2, x);
sum = sum + 0.25f * src(src_y - 1, x);
sum = sum + 0.375f * src(src_y , x);
sum = sum + 0.25f * src(src_y + 1, x);
sum = sum + 0.0625f * src(src_y + 2, x);
smem[2 + threadIdx.x] = sum;
}
if (threadIdx.x < 2)
{
const int left_x = x - 2;
work_t sum;
sum = 0.0625f * src(src_y - 2, left_x);
sum = sum + 0.25f * src(src_y - 1, left_x);
sum = sum + 0.375f * src(src_y , left_x);
sum = sum + 0.25f * src(src_y + 1, left_x);
sum = sum + 0.0625f * src(src_y + 2, left_x);
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + 2;
work_t sum;
sum = 0.0625f * src(src_y - 2, right_x);
sum = sum + 0.25f * src(src_y - 1, right_x);
sum = sum + 0.375f * src(src_y , right_x);
sum = sum + 0.25f * src(src_y + 1, right_x);
sum = sum + 0.0625f * src(src_y + 2, right_x);
smem[4 + threadIdx.x] = sum;
}
}
else
{
{
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(x));
sum = sum + 0.375f * src(src_y , b.idx_col_high(x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x));
smem[2 + threadIdx.x] = sum;
}
if (threadIdx.x < 2)
{
const int left_x = x - 2;
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col(left_x));
sum = sum + 0.375f * src(src_y , b.idx_col(left_x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col(left_x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x));
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + 2;
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x));
sum = sum + 0.375f * src(src_y , b.idx_col_high(right_x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x));
smem[4 + threadIdx.x] = sum;
}
}
__syncthreads();
if (threadIdx.x < 128)
{
const int tid2 = threadIdx.x * 2;
work_t sum;
sum = 0.0625f * smem[2 + tid2 - 2];
sum = sum + 0.25f * smem[2 + tid2 - 1];
sum = sum + 0.375f * smem[2 + tid2 ];
sum = sum + 0.25f * smem[2 + tid2 + 1];
sum = sum + 0.0625f * smem[2 + tid2 + 2];
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
if (dst_x < dst_cols)
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
}
}
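// pyrDown applies the 5-tap Gaussian kernel [1 4 6 4 1] / 16 vertically into shared
// memory (with a 2-pixel apron on each side), then horizontally over every other column,
// so each block row produces one half-resolution output row.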
template <typename T, template <typename> class B> void pyrDown_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows);
B<T> b(src.rows, src.cols);
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
pyrDown_caller<T, BrdReflect101>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
}
template void pyrDown_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
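The pyrDown kernel above evaluates the separable 5-tap Gaussian (weights 1/16, 4/16, 6/16, 4/16, 1/16) column-wise into shared memory, including a two-pixel apron on each side of the 256-wide block, then filters row-wise over the shared buffer and keeps every second column. As a rough cross-check of the arithmetic only, here is a minimal single-channel CPU sketch of the same filter with reflect-101 borders; the helper names are hypothetical and not part of the module.

#include <cstdlib>   // std::abs

// Mirror an out-of-range index the way BrdReflect101 does (...cb|abcdefg|fe...).
static inline int reflect101(int p, int len)
{
    p = std::abs(p);
    if (p >= len)
        p = 2 * len - 2 - p;
    return p;
}

// Reference pyrDown for a single-channel float image: 5x5 separable Gaussian
// followed by dropping every other row and column.
static void pyrDownReference(const float* src, int rows, int cols, float* dst)
{
    const float w[5] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f };
    const int drows = (rows + 1) / 2;
    const int dcols = (cols + 1) / 2;
    for (int y = 0; y < drows; ++y)
        for (int x = 0; x < dcols; ++x)
        {
            float sum = 0.0f;
            for (int i = -2; i <= 2; ++i)
                for (int j = -2; j <= 2; ++j)
                {
                    const int sy = reflect101(2 * y + i, rows);
                    const int sx = reflect101(2 * x + j, cols);
                    sum += w[i + 2] * w[j + 2] * src[sy * cols + sx];
                }
            dst[y * dcols + x] = sum;
        }
}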

@@ -1,196 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename T> __global__ void pyrUp(const PtrStepSz<T> src, PtrStepSz<T> dst)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
__shared__ sum_t s_srcPatch[10][10];
__shared__ sum_t s_dstPatch[20][16];
if (threadIdx.x < 10 && threadIdx.y < 10)
{
int srcx = static_cast<int>((blockIdx.x * blockDim.x) / 2 + threadIdx.x) - 1;
int srcy = static_cast<int>((blockIdx.y * blockDim.y) / 2 + threadIdx.y) - 1;
srcx = ::abs(srcx);
srcx = ::min(src.cols - 1, srcx);
srcy = ::abs(srcy);
srcy = ::min(src.rows - 1, srcy);
s_srcPatch[threadIdx.y][threadIdx.x] = saturate_cast<sum_t>(src(srcy, srcx));
}
__syncthreads();
sum_t sum = VecTraits<sum_t>::all(0);
const int evenFlag = static_cast<int>((threadIdx.x & 1) == 0);
const int oddFlag = static_cast<int>((threadIdx.x & 1) != 0);
const bool eveny = ((threadIdx.y & 1) == 0);
const int tidx = threadIdx.x;
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + threadIdx.y][threadIdx.x] = sum;
if (threadIdx.y < 2)
{
sum = VecTraits<sum_t>::all(0);
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[threadIdx.y][threadIdx.x] = sum;
}
if (threadIdx.y > 13)
{
sum = VecTraits<sum_t>::all(0);
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + threadIdx.y][threadIdx.x] = sum;
}
__syncthreads();
sum = VecTraits<sum_t>::all(0);
const int tidy = threadIdx.y;
sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][threadIdx.x];
sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][threadIdx.x];
sum = sum + 0.375f * s_dstPatch[2 + tidy ][threadIdx.x];
sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][threadIdx.x];
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][threadIdx.x];
if (x < dst.cols && y < dst.rows)
dst(y, x) = saturate_cast<T>(4.0f * sum);
}
template <typename T> void pyrUp_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
pyrUp<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
pyrUp_caller<T>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
}
template void pyrUp_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
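In the pyrUp kernel above, the evenFlag/oddFlag multipliers implement zero-insertion upsampling: conceptually a zero is inserted between every pair of source samples, the same 5-tap kernel is applied, and the result is scaled to compensate for the inserted zeros (a factor of 2 per dimension, which is where the final 4.0f comes from). A minimal 1-D sketch of that idea, with a hypothetical name and clamped borders for brevity:

// One row of zero-insertion upsampling: dst has 2*len samples. Odd kernel taps
// always land on inserted zeros and even taps on real samples - exactly what
// the evenFlag/oddFlag multipliers select in the device code. The 2.0f here is
// the per-dimension gain; the 2-D kernel applies it twice, giving the 4.0f above.
static void pyrUpRowReference(const float* src, int len, float* dst)
{
    const float w[5] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f };
    for (int x = 0; x < 2 * len; ++x)
    {
        float sum = 0.0f;
        for (int j = -2; j <= 2; ++j)
        {
            const int p = x + j;
            if (p & 1)
                continue;                                    // inserted zero contributes nothing
            int s = p / 2;
            s = s < 0 ? 0 : (s > len - 1 ? len - 1 : s);     // clamp at the borders
            sum += w[j + 2] * src[s];
        }
        dst[x] = 2.0f * sum;
    }
}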

@@ -1,274 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = mapx.ptr(y)[x];
const float ycoo = mapy.ptr(y)[x];
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, bool)
{
(void)srcWhole;
(void)xoff;
(void)yoff;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_remap_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff, yoff; \
tex_remap_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_remap_ ## type , x + xoff, y + yoff); \
} \
}; \
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float* borderValue, bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float*, bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
{
if (stream == 0)
RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc20);
else
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc20);
}
};
template <typename T> void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
static const caller_t callers[3][5] =
{
{
RemapDispatcher<PointFilter, BrdReflect101, T>::call,
RemapDispatcher<PointFilter, BrdReplicate, T>::call,
RemapDispatcher<PointFilter, BrdConstant, T>::call,
RemapDispatcher<PointFilter, BrdReflect, T>::call,
RemapDispatcher<PointFilter, BrdWrap, T>::call
},
{
RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
RemapDispatcher<LinearFilter, BrdConstant, T>::call,
RemapDispatcher<LinearFilter, BrdReflect, T>::call,
RemapDispatcher<LinearFilter, BrdWrap, T>::call
},
{
RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
RemapDispatcher<CubicFilter, BrdConstant, T>::call,
RemapDispatcher<CubicFilter, BrdReflect, T>::call,
RemapDispatcher<CubicFilter, BrdWrap, T>::call
}
};
callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
}
template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
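remap_gpu above indexes callers[interpolation][borderMode], with rows {point, linear, cubic} and columns {reflect101, replicate, constant, reflect, wrap}; the host wrapper in the corresponding .cpp (not shown here) is assumed to translate the public OpenCV interpolation and border constants into these indices. A hypothetical call for a bilinear remap of an 8-bit, 3-channel image with a constant black border would therefore land in RemapDispatcher<LinearFilter, BrdConstant, uchar3>::call:

// Illustrative host-side invocation only; remapExample and its argument values
// are assumptions for this sketch, not part of the module.
void remapExample(PtrStepSzb src, PtrStepSzb srcWhole,
                  PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst,
                  cudaStream_t stream, bool cc20)
{
    const float borderValue[4] = { 0.f, 0.f, 0.f, 0.f };   // black border

    cv::gpu::cudev::imgproc::remap_gpu<uchar3>(
        src, srcWhole, /*xoff=*/0, /*yoff=*/0, xmap, ymap, dst,
        /*interpolation=*/1,   // row 1: LinearFilter
        /*borderMode=*/2,      // column 2: BrdConstant
        borderValue, stream, cc20);
}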

@@ -1,302 +0,0 @@
#if !defined CUDA_DISABLER
#include <cfloat>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include "opencv2/core/cuda/scan.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = x * fx;
const float ycoo = y * fy;
dst(y, x) = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <typename Ptr2D, typename T> __global__ void resize_area(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
dst(y, x) = saturate_cast<T>(src(y, x));
}
}
template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc, fx, fy);
resize<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <typename T> struct ResizeDispatcherStream<AreaFilter, T>
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
AreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T> struct ResizeDispatcherStream<IntegerAreaFilter, T>
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst)
{
(void)srcWhole;
(void)xoff;
(void)yoff;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
resize<<<grid, block>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_resize_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
const int xoff; \
const int yoff; \
__host__ tex_resize_ ## type ## _reader(int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_resize_ ## type, x + xoff, y + yoff); \
} \
}; \
template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type > \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz< type > dst) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_resize_ ## type, srcWhole); \
tex_resize_ ## type ## _reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter<tex_resize_ ## type ## _reader> filteredSrc(texSrc); \
resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
} \
else \
{ \
BrdReplicate< type > brd(src.rows, src.cols); \
BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > brdSrc(texSrc, brd); \
Filter< BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > > filteredSrc(brdSrc); \
resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
template <template <typename> class Filter, typename T> struct ResizeDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
if (stream == 0)
ResizeDispatcherNonStream<Filter, T>::call(src, srcWhole, xoff, yoff, fx, fy, dst);
else
ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
}
};
template <typename T> struct ResizeDispatcher<AreaFilter, T>
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
(void)srcWhole;
(void)xoff;
(void)yoff;
int iscale_x = (int)round(fx);
int iscale_y = (int)round(fy);
if (std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN)
ResizeDispatcherStream<IntegerAreaFilter, T>::call(src, fx, fy, dst, stream);
else
ResizeDispatcherStream<AreaFilter, T>::call(src, fx, fy, dst, stream);
}
};
template <typename T> void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
PtrStepSzb dst, int interpolation, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream);
static const caller_t callers[4] =
{
ResizeDispatcher<PointFilter, T>::call,
ResizeDispatcher<LinearFilter, T>::call,
ResizeDispatcher<CubicFilter, T>::call,
ResizeDispatcher<AreaFilter, T>::call
};
// change to linear interpolation when area interpolation is requested for upscaling
if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
interpolation = 1;
callers[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, fx, fy,
static_cast< PtrStepSz<T> >(dst), stream);
}
template void resize_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template<typename T> struct scan_traits{};
template<> struct scan_traits<uchar>
{
typedef float scan_line_type;
};
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
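For the resize path, fx and fy map destination to source coordinates (dst(y, x) reads src(y * fy, x * fx)), so a ratio of 1 or less means that axis is not being shrunk. ResizeDispatcher<AreaFilter> then picks the integer or fractional area filter depending on whether the ratios are whole numbers, and resize_gpu silently downgrades area interpolation to linear when upscaling. A compact restatement of that decision logic, written as a hypothetical helper:

#include <cmath>    // std::abs, round
#include <cfloat>   // FLT_MIN

// Returns the filter that the dispatch above would effectively use.
static const char* effectiveResizeFilter(int interpolation, float fx, float fy)
{
    if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
        return "LinearFilter";          // area requested but not strictly downscaling on both axes
    if (interpolation == 0) return "PointFilter";
    if (interpolation == 1) return "LinearFilter";
    if (interpolation == 2) return "CubicFilter";

    const int ix = (int)round(fx);
    const int iy = (int)round(fy);
    const bool wholeScale = std::abs(fx - ix) < FLT_MIN && std::abs(fy - iy) < FLT_MIN;
    return wholeScale ? "IntegerAreaFilter" : "AreaFilter";
}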

@@ -1,389 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
__constant__ float c_warpMat[3 * 3];
struct AffineTransform
{
static __device__ __forceinline__ float2 calcCoord(int x, int y)
{
const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];
return make_float2(xcoo, ycoo);
}
};
struct PerspectiveTransform
{
static __device__ __forceinline__ float2 calcCoord(int x, int y)
{
const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);
return make_float2(xcoo, ycoo);
}
};
///////////////////////////////////////////////////////////////////
// Build Maps
template <class Transform> __global__ void buildWarpMaps(PtrStepSzf xmap, PtrStepf ymap)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < xmap.cols && y < xmap.rows)
{
const float2 coord = Transform::calcCoord(x, y);
xmap(y, x) = coord.x;
ymap(y, x) = coord.y;
}
}
template <class Transform> void buildWarpMaps_caller(PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(xmap.cols, block.x), divUp(xmap.rows, block.y));
buildWarpMaps<Transform><<<grid, block, 0, stream>>>(xmap, ymap);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
buildWarpMaps_caller<AffineTransform>(xmap, ymap, stream);
}
void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
buildWarpMaps_caller<PerspectiveTransform>(xmap, ymap, stream);
}
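buildWarpAffineMaps_gpu and buildWarpPerspectiveMaps_gpu simply upload the 2x3 or 3x3 matrix into c_warpMat and evaluate it once per destination pixel, producing xmap/ymap in the dst-to-src direction that the remap kernels expect. A hypothetical host-side usage, assuming the matrix has already been inverted into dst-to-src form by the public wrapper:

// Sketch only: fills maps for srcX = 0.5*dstX + 10, srcY = 0.5*dstY + 20.
// buildAffineMapsExample and the coefficient values are illustrative assumptions.
void buildAffineMapsExample(PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
    float coeffs[2 * 3] = { 0.5f, 0.0f, 10.0f,    // row 0: a00 a01 b0
                            0.0f, 0.5f, 20.0f };  // row 1: a10 a11 b1

    cv::gpu::cudev::imgproc::buildWarpAffineMaps_gpu(coeffs, xmap, ymap, stream);
}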
///////////////////////////////////////////////////////////////////
// Warp
template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float2 coord = Transform::calcCoord(x, y);
dst.ptr(y)[x] = saturate_cast<T>(src(coord.y, coord.x));
}
}
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, bool)
{
(void)xoff;
(void)yoff;
(void)srcWhole;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
warp<Transform><<<grid, block>>>(filter_src, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_WARP_TEX(type) \
texture< type , cudaTextureType2D > tex_warp_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_warp_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff, yoff; \
tex_warp_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_warp_ ## type , x + xoff, y + yoff); \
} \
}; \
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(schar)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(char2)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(char4)
OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_WARP_TEX(short)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(short2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(short4)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(int)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(int2)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(int4)
OPENCV_GPU_IMPLEMENT_WARP_TEX(float)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(float2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_WARP_TEX
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
{
if (stream == 0)
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc20);
else
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc20);
}
};
template <class Transform, typename T>
void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
static const func_t funcs[3][5] =
{
{
WarpDispatcher<Transform, PointFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call
},
{
WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call
},
{
WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call
}
};
funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
}
template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
}
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
}
template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
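For context, the host wrapper normally selects one of the instantiations above through a depth-by-channels function table. The sketch below is illustrative only: the dispatcher name is hypothetical, and the zero entries mirror the instantiations that are commented out above.

// Hypothetical host-side dispatcher (illustrative sketch, not from the original sources):
// picks a compiled warpAffine_gpu instantiation by depth and channel count.
static void dispatchWarpAffine(cv::gpu::PtrStepSzb src, cv::gpu::PtrStepSzb srcWhole, int xoff, int yoff,
                               float coeffs[2 * 3], cv::gpu::PtrStepSzb dst, int depth, int cn,
                               int interpolation, int borderMode, const float* borderValue,
                               cudaStream_t stream, bool cc20)
{
    using namespace cv::gpu::cudev::imgproc;
    typedef void (*caller_t)(cv::gpu::PtrStepSzb, cv::gpu::PtrStepSzb, int, int, float[2 * 3], cv::gpu::PtrStepSzb,
                             int, int, const float*, cudaStream_t, bool);
    // Rows follow the OpenCV depth order CV_8U..CV_32F, columns are channel counts 1..4;
    // zero entries correspond to the instantiations disabled above.
    static const caller_t callers[6][4] =
    {
        {warpAffine_gpu<uchar> , 0, warpAffine_gpu<uchar3> , warpAffine_gpu<uchar4> },
        {0                     , 0, 0                      , 0                      },
        {warpAffine_gpu<ushort>, 0, warpAffine_gpu<ushort3>, warpAffine_gpu<ushort4>},
        {warpAffine_gpu<short> , 0, warpAffine_gpu<short3> , warpAffine_gpu<short4> },
        {0                     , 0, 0                      , 0                      },
        {warpAffine_gpu<float> , 0, warpAffine_gpu<float3> , warpAffine_gpu<float4> }
    };
    const caller_t caller = callers[depth][cn - 1];
    if (caller)
        caller(src, srcWhole, xoff, yoff, coeffs, dst, interpolation, borderMode, borderValue, stream, cc20);
}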

View File

@@ -1,274 +0,0 @@
#ifndef __cvt_color_internal_h__
#define __cvt_color_internal_h__
namespace cv { namespace gpu { namespace cudev
{
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f)
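For illustration, a single use of the _ALL macro expands into one stream-aware declaration per supported depth; for example, OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb) expands to:

void bgr_to_rgb_8u (PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
void bgr_to_rgb_16u(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
void bgr_to_rgb_32f(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);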
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgra)
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL
}}}
#endif

View File

@@ -1,198 +0,0 @@
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::bilateralFilter(const GpuMat&, GpuMat&, int, float, float, int, Stream&) { throw_no_cuda(); }
void cv::gpu::nonLocalMeans(const GpuMat&, GpuMat&, float, int, int, int, Stream&) { throw_no_cuda(); }
void cv::gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat&, GpuMat&, float, int, int, Stream&) { throw_no_cuda(); }
void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat&, GpuMat&, float, float, int, int, Stream&) { throw_no_cuda(); }
#else
//////////////////////////////////////////////////////////////////////////////////
//// Non Local Means Denoising (brute force)
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template<typename T>
void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, int borderMode, cudaStream_t stream);
template<typename T>
void nlm_bruteforce_gpu(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream);
}
}}}
void cv::gpu::bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, float sigma_color, float sigma_spatial, int borderMode, Stream& s)
{
using cv::gpu::cudev::imgproc::bilateral_filter_gpu;
typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, int borderMode, cudaStream_t s);
static const func_t funcs[6][4] =
{
{bilateral_filter_gpu<uchar> , 0 /*bilateral_filter_gpu<uchar2>*/ , bilateral_filter_gpu<uchar3> , bilateral_filter_gpu<uchar4> },
{0 /*bilateral_filter_gpu<schar>*/, 0 /*bilateral_filter_gpu<schar2>*/ , 0 /*bilateral_filter_gpu<schar3>*/, 0 /*bilateral_filter_gpu<schar4>*/},
{bilateral_filter_gpu<ushort> , 0 /*bilateral_filter_gpu<ushort2>*/, bilateral_filter_gpu<ushort3> , bilateral_filter_gpu<ushort4> },
{bilateral_filter_gpu<short> , 0 /*bilateral_filter_gpu<short2>*/ , bilateral_filter_gpu<short3> , bilateral_filter_gpu<short4> },
{0 /*bilateral_filter_gpu<int>*/ , 0 /*bilateral_filter_gpu<int2>*/ , 0 /*bilateral_filter_gpu<int3>*/ , 0 /*bilateral_filter_gpu<int4>*/ },
{bilateral_filter_gpu<float> , 0 /*bilateral_filter_gpu<float2>*/ , bilateral_filter_gpu<float3> , bilateral_filter_gpu<float4> }
};
sigma_color = (sigma_color <= 0 ) ? 1 : sigma_color;
sigma_spatial = (sigma_spatial <= 0 ) ? 1 : sigma_spatial;
int radius = (kernel_size <= 0) ? cvRound(sigma_spatial*1.5) : kernel_size/2;
kernel_size = std::max(radius, 1)*2 + 1;
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
dst.create(src.size(), src.type());
func(src, dst, kernel_size, sigma_spatial, sigma_color, gpuBorderType, StreamAccessor::getStream(s));
}
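A minimal usage sketch for the wrapper above (illustrative only; assumes the 2.4-era gpu headers and a CUDA-capable device):

// Illustrative usage of cv::gpu::bilateralFilter on an 8UC3 host image.
void bilateralFilterExample(const cv::Mat& img)
{
    cv::gpu::GpuMat d_src(img), d_dst;               // upload to the device
    cv::gpu::bilateralFilter(d_src, d_dst,
                             9,                      // kernel_size
                             50.f,                   // sigma_color
                             7.f,                    // sigma_spatial
                             cv::BORDER_DEFAULT,
                             cv::gpu::Stream::Null());
    cv::Mat result;
    d_dst.download(result);                          // copy back to the host
}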
void cv::gpu::nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, int borderMode, Stream& s)
{
using cv::gpu::cudev::imgproc::nlm_bruteforce_gpu;
typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream);
static const func_t funcs[4] = { nlm_bruteforce_gpu<uchar>, nlm_bruteforce_gpu<uchar2>, nlm_bruteforce_gpu<uchar3>, 0/*nlm_bruteforce_gpu<uchar4>,*/ };
CV_Assert(src.type() == CV_8U || src.type() == CV_8UC2 || src.type() == CV_8UC3);
const func_t func = funcs[src.channels() - 1];
CV_Assert(func != 0);
int b = borderMode;
CV_Assert(b == BORDER_REFLECT101 || b == BORDER_REPLICATE || b == BORDER_CONSTANT || b == BORDER_REFLECT || b == BORDER_WRAP);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
dst.create(src.size(), src.type());
func(src, dst, search_window/2, block_window/2, h, gpuBorderType, StreamAccessor::getStream(s));
}
//////////////////////////////////////////////////////////////////////////////////
//// Non Local Means Denoising (fast approximate)
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
void nln_fast_get_buffer_size(const PtrStepSzb& src, int search_window, int block_window, int& buffer_cols, int& buffer_rows);
template<typename T>
void nlm_fast_gpu(const PtrStepSzb& src, PtrStepSzb dst, PtrStepi buffer,
int search_window, int block_window, float h, cudaStream_t stream);
void fnlm_split_channels(const PtrStepSz<uchar3>& lab, PtrStepb l, PtrStep<uchar2> ab, cudaStream_t stream);
void fnlm_merge_channels(const PtrStepb& l, const PtrStep<uchar2>& ab, PtrStepSz<uchar3> lab, cudaStream_t stream);
}
}}}
void cv::gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, Stream& s)
{
CV_Assert(src.depth() == CV_8U && src.channels() < 4);
int border_size = search_window/2 + block_window/2;
Size esize = src.size() + Size(border_size, border_size) * 2;
cv::gpu::ensureSizeIsEnough(esize, CV_8UC3, extended_src_buffer);
GpuMat extended_src(esize, src.type(), extended_src_buffer.ptr(), extended_src_buffer.step);
cv::gpu::copyMakeBorder(src, extended_src, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), s);
GpuMat src_hdr = extended_src(Rect(Point2i(border_size, border_size), src.size()));
int bcols, brows;
cudev::imgproc::nln_fast_get_buffer_size(src_hdr, search_window, block_window, bcols, brows);
buffer.create(brows, bcols, CV_32S);
using namespace cv::gpu::cudev::imgproc;
typedef void (*nlm_fast_t)(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
static const nlm_fast_t funcs[] = { nlm_fast_gpu<uchar>, nlm_fast_gpu<uchar2>, nlm_fast_gpu<uchar3>, 0};
dst.create(src.size(), src.type());
funcs[src.channels()-1](src_hdr, dst, buffer, search_window, block_window, h, StreamAccessor::getStream(s));
}
void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window, int block_window, Stream& s)
{
CV_Assert(src.type() == CV_8UC3);
lab.create(src.size(), src.type());
cv::gpu::cvtColor(src, lab, cv::COLOR_BGR2Lab, 0, s);
l.create(src.size(), CV_8U);
ab.create(src.size(), CV_8UC2);
cudev::imgproc::fnlm_split_channels(lab, l, ab, StreamAccessor::getStream(s));
simpleMethod(l, l, h_luminance, search_window, block_window, s);
simpleMethod(ab, ab, h_color, search_window, block_window, s);
cudev::imgproc::fnlm_merge_channels(l, ab, lab, StreamAccessor::getStream(s));
cv::gpu::cvtColor(lab, dst, cv::COLOR_Lab2BGR, 0, s);
}
#endif
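A minimal usage sketch for the class above (illustrative only; the filter strengths are arbitrary, and the 21/7 window sizes are the conventional choices rather than values taken from this file):

// Illustrative usage of FastNonLocalMeansDenoising::labMethod on an 8UC3 host image.
void fastNlmExample(const cv::Mat& bgrImage)
{
    cv::gpu::FastNonLocalMeansDenoising denoiser;
    cv::gpu::GpuMat d_src(bgrImage), d_dst;
    denoiser.labMethod(d_src, d_dst,
                       3.0f,                         // h_luminance
                       10.0f,                        // h_color
                       21, 7,                        // search_window, block_window
                       cv::gpu::Stream::Null());
    cv::Mat denoised;
    d_dst.download(denoised);
}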

View File

@@ -1,169 +0,0 @@
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::GoodFeaturesToTrackDetector_GPU::operator ()(const GpuMat&, GpuMat&, const GpuMat&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace cudev
{
namespace gfft
{
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count);
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count);
}
}}}
void cv::gpu::GoodFeaturesToTrackDetector_GPU::operator ()(const GpuMat& image, GpuMat& corners, const GpuMat& mask)
{
using namespace cv::gpu::cudev::gfft;
CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
ensureSizeIsEnough(image.size(), CV_32F, eig_);
if (useHarrisDetector)
cornerHarris(image, eig_, Dx_, Dy_, buf_, blockSize, 3, harrisK);
else
cornerMinEigenVal(image, eig_, Dx_, Dy_, buf_, blockSize, 3);
double maxVal = 0;
minMax(eig_, 0, &maxVal, GpuMat(), minMaxbuf_);
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols);
if (total == 0)
{
corners.release();
return;
}
sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total);
if (minDistance < 1)
tmpCorners_.colRange(0, maxCorners > 0 ? std::min(maxCorners, total) : total).copyTo(corners);
else
{
std::vector<Point2f> tmp(total);
Mat tmpMat(1, total, CV_32FC2, (void*)&tmp[0]);
tmpCorners_.colRange(0, total).download(tmpMat);
std::vector<Point2f> tmp2;
tmp2.reserve(total);
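// Greedy minimum-distance suppression: candidates arrive sorted by corner response,
// so each one only needs to be tested against already accepted corners in its own
// grid cell and the eight neighbouring cells (cell size equals minDistance).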
const int cell_size = cvRound(minDistance);
const int grid_width = (image.cols + cell_size - 1) / cell_size;
const int grid_height = (image.rows + cell_size - 1) / cell_size;
std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
for (int i = 0; i < total; ++i)
{
Point2f p = tmp[i];
bool good = true;
int x_cell = static_cast<int>(p.x / cell_size);
int y_cell = static_cast<int>(p.y / cell_size);
int x1 = x_cell - 1;
int y1 = y_cell - 1;
int x2 = x_cell + 1;
int y2 = y_cell + 1;
// boundary check
x1 = std::max(0, x1);
y1 = std::max(0, y1);
x2 = std::min(grid_width - 1, x2);
y2 = std::min(grid_height - 1, y2);
for (int yy = y1; yy <= y2; yy++)
{
for (int xx = x1; xx <= x2; xx++)
{
std::vector<Point2f>& m = grid[yy * grid_width + xx];
if (!m.empty())
{
for(size_t j = 0; j < m.size(); j++)
{
float dx = p.x - m[j].x;
float dy = p.y - m[j].y;
if (dx * dx + dy * dy < minDistance * minDistance)
{
good = false;
goto break_out;
}
}
}
}
}
break_out:
if(good)
{
grid[y_cell * grid_width + x_cell].push_back(p);
tmp2.push_back(p);
if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
break;
}
}
corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
}
}
#endif /* !defined (HAVE_CUDA) */
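A minimal usage sketch for the detector above (illustrative only; assumes the usual (maxCorners, qualityLevel, minDistance) constructor of GoodFeaturesToTrackDetector_GPU):

// Illustrative usage of GoodFeaturesToTrackDetector_GPU on an 8UC1 host image.
void gpuGfttExample(const cv::Mat& grayImage)
{
    cv::gpu::GoodFeaturesToTrackDetector_GPU detector(1000, 0.01, 10.0);
    cv::gpu::GpuMat d_gray(grayImage), d_corners;
    detector(d_gray, d_corners, cv::gpu::GpuMat());  // no mask
    cv::Mat corners;
    d_corners.download(corners);                     // 1 x N row of CV_32FC2 corner positions
}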

View File

@@ -1,282 +0,0 @@
#include "precomp.hpp"
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::connectivityMask(const GpuMat&, GpuMat&, const cv::Scalar&, const cv::Scalar&, Stream&) { throw_no_cuda(); }
void cv::gpu::labelComponents(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace cudev
{
namespace ccl
{
void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream);
template<typename T>
void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
}
}}}
static float4 scalarToCudaType(const cv::Scalar& in)
{
return make_float4((float)in[0], (float)in[1], (float)in[2], (float)in[3]);
}
void cv::gpu::connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scalar& lo, const cv::Scalar& hi, Stream& s)
{
CV_Assert(!image.empty());
int ch = image.channels();
CV_Assert(ch <= 4);
int depth = image.depth();
typedef void (*func_t)(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
static const func_t supportLookup[8][4] =
{ // 1, 2, 3, 4
{ cudev::ccl::computeEdges<uchar>, 0, cudev::ccl::computeEdges<uchar3>, cudev::ccl::computeEdges<uchar4> },// CV_8U
{ 0, 0, 0, 0 },// CV_8S
{ cudev::ccl::computeEdges<ushort>, 0, cudev::ccl::computeEdges<ushort3>, cudev::ccl::computeEdges<ushort4> },// CV_16U
{ 0, 0, 0, 0 },// CV_16S
{ cudev::ccl::computeEdges<int>, 0, 0, 0 },// CV_32S
{ cudev::ccl::computeEdges<float>, 0, 0, 0 },// CV_32F
{ 0, 0, 0, 0 },// CV_64F
{ 0, 0, 0, 0 } // CV_USRTYPE1
};
func_t f = supportLookup[depth][ch - 1];
CV_Assert(f);
if (image.size() != mask.size() || mask.type() != CV_8UC1)
mask.create(image.size(), CV_8UC1);
cudaStream_t stream = StreamAccessor::getStream(s);
float4 culo = scalarToCudaType(lo), cuhi = scalarToCudaType(hi);
f(image, mask, culo, cuhi, stream);
}
void cv::gpu::labelComponents(const GpuMat& mask, GpuMat& components, int flags, Stream& s)
{
CV_Assert(!mask.empty() && mask.type() == CV_8U);
if (!deviceSupports(SHARED_ATOMICS))
CV_Error(cv::Error::StsNotImplemented, "The device doesn't support shared atomics and communicative synchronization!");
components.create(mask.size(), CV_32SC1);
cudaStream_t stream = StreamAccessor::getStream(s);
cudev::ccl::labelComponents(mask, components, flags, stream);
}
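A minimal usage sketch for the two functions above (illustrative only; the lo/hi thresholds and the flags value of 0 are assumptions, not values taken from this file):

// Illustrative usage: build a connectivity mask from an 8UC1 image, then label components.
void gpuCclExample(const cv::Mat& grayImage)
{
    cv::gpu::GpuMat d_img(grayImage), d_mask, d_labels;
    cv::gpu::connectivityMask(d_img, d_mask, cv::Scalar::all(0), cv::Scalar::all(2), cv::gpu::Stream::Null());
    cv::gpu::labelComponents(d_mask, d_labels, 0, cv::gpu::Stream::Null());
    cv::Mat labels;
    d_labels.download(labels);                       // CV_32SC1 label image
}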
namespace
{
typedef NppStatus (*init_func_t)(NppiSize oSize, NppiGraphcutState** ppState, Npp8u* pDeviceMem);
class NppiGraphcutStateHandler
{
public:
NppiGraphcutStateHandler(NppiSize sznpp, Npp8u* pDeviceMem, const init_func_t func)
{
nppSafeCall( func(sznpp, &pState, pDeviceMem) );
}
~NppiGraphcutStateHandler()
{
nppSafeCall( nppiGraphcutFree(pState) );
}
operator NppiGraphcutState*()
{
return pState;
}
private:
NppiGraphcutState* pState;
};
}
void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s)
{
#if (CUDA_VERSION < 5000)
CV_Assert(terminals.type() == CV_32S);
#else
CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F);
#endif
Size src_size = terminals.size();
CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
CV_Assert(leftTransp.type() == terminals.type());
CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
CV_Assert(rightTransp.type() == terminals.type());
CV_Assert(top.size() == src_size);
CV_Assert(top.type() == terminals.type());
CV_Assert(bottom.size() == src_size);
CV_Assert(bottom.type() == terminals.type());
labels.create(src_size, CV_8U);
NppiSize sznpp;
sznpp.width = src_size.width;
sznpp.height = src_size.height;
int bufsz;
nppSafeCall( nppiGraphcutGetSize(sznpp, &bufsz) );
ensureSizeIsEnough(1, bufsz, CV_8U, buf);
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcutInitAlloc);
#if (CUDA_VERSION < 5000)
nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
#else
if (terminals.type() == CV_32S)
{
nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
}
else
{
nppSafeCall( nppiGraphcut_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(), top.ptr<Npp32f>(), bottom.ptr<Npp32f>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
}
#endif
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, GpuMat& labels, GpuMat& buf, Stream& s)
{
#if (CUDA_VERSION < 5000)
CV_Assert(terminals.type() == CV_32S);
#else
CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F);
#endif
Size src_size = terminals.size();
CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
CV_Assert(leftTransp.type() == terminals.type());
CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
CV_Assert(rightTransp.type() == terminals.type());
CV_Assert(top.size() == src_size);
CV_Assert(top.type() == terminals.type());
CV_Assert(topLeft.size() == src_size);
CV_Assert(topLeft.type() == terminals.type());
CV_Assert(topRight.size() == src_size);
CV_Assert(topRight.type() == terminals.type());
CV_Assert(bottom.size() == src_size);
CV_Assert(bottom.type() == terminals.type());
CV_Assert(bottomLeft.size() == src_size);
CV_Assert(bottomLeft.type() == terminals.type());
CV_Assert(bottomRight.size() == src_size);
CV_Assert(bottomRight.type() == terminals.type());
labels.create(src_size, CV_8U);
NppiSize sznpp;
sznpp.width = src_size.width;
sznpp.height = src_size.height;
int bufsz;
nppSafeCall( nppiGraphcut8GetSize(sznpp, &bufsz) );
ensureSizeIsEnough(1, bufsz, CV_8U, buf);
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcut8InitAlloc);
#if (CUDA_VERSION < 5000)
nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
#else
if (terminals.type() == CV_32S)
{
nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
}
else
{
nppSafeCall( nppiGraphcut8_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(),
top.ptr<Npp32f>(), topLeft.ptr<Npp32f>(), topRight.ptr<Npp32f>(),
bottom.ptr<Npp32f>(), bottomLeft.ptr<Npp32f>(), bottomRight.ptr<Npp32f>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
}
#endif
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
#endif /* !defined (HAVE_CUDA) */

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,439 +0,0 @@
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
#else
namespace cv { namespace gpu { namespace cudev
{
namespace match_template
{
void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
int cn, cudaStream_t stream);
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
int cn, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
const PtrStepSz<unsigned int> image_sum_a,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
unsigned int templ_sum_a,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const PtrStepSz<unsigned int> image_sum,
const PtrStepSz<unsigned long long> image_sqsum,
unsigned int templ_sum, unsigned long long templ_sqsum,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
PtrStepSzf result, cudaStream_t stream);
void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream);
void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream);
}
}}}
using namespace ::cv::gpu::cudev::match_template;
namespace
{
// Evaluates the optimal template-area threshold. If the template's area is
// less than the threshold, the naive matchTemplate implementation is used;
// otherwise the FFT-based one (if available).
int getTemplateThreshold(int method, int depth)
{
switch (method)
{
case cv::TM_CCORR:
if (depth == CV_32F) return 250;
if (depth == CV_8U) return 300;
break;
case cv::TM_SQDIFF:
if (depth == CV_8U) return 300;
break;
}
CV_Error(cv::Error::StsBadArg, "getTemplateThreshold: unsupported match template mode");
return 0;
}
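// For example, with an 8U image and TM_CCORR: a 16x16 template (area 256 < 300)
// takes the naive kernel, while a 32x32 template (area 1024 >= 300) is routed
// through the FFT-based convolve() path below.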
void matchTemplate_CCORR_32F(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
if (templ.size().area() < getTemplateThreshold(cv::TM_CCORR, CV_32F))
{
matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return;
}
ConvolveBuf convolve_buf;
convolve_buf.user_block_size = buf.user_block_size;
if (image.channels() == 1)
convolve(image.reshape(1), templ.reshape(1), result, true, convolve_buf, stream);
else
{
GpuMat result_;
convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf, stream);
extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));
}
}
void matchTemplate_CCORR_8U(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
if (templ.size().area() < getTemplateThreshold(cv::TM_CCORR, CV_8U))
{
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return;
}
if (stream)
{
stream.enqueueConvert(image, buf.imagef, CV_32F);
stream.enqueueConvert(templ, buf.templf, CV_32F);
}
else
{
image.convertTo(buf.imagef, CV_32F);
templ.convertTo(buf.templf, CV_32F);
}
matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
}
void matchTemplate_CCORR_NORMED_8U(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
matchTemplate_CCORR_8U(image, templ, result, buf, stream);
buf.image_sqsums.resize(1);
sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
normalize_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
}
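The normalize_8U step above applies the standard TM_CCORR_NORMED denominator: each raw cross-correlation value is divided by the geometric mean of the windowed image energy and the template energy,

    R_normed(x, y) = R_ccorr(x, y) / sqrt( S_I(x, y) * S_T )

where S_I(x, y) is the sum of squared image pixels under the template window (read from the squared integral image in buf.image_sqsums[0]) and S_T is templ_sqsum.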
void matchTemplate_SQDIFF_32F(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
(void)buf;
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
}
void matchTemplate_SQDIFF_8U(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
if (templ.size().area() < getTemplateThreshold(cv::TM_SQDIFF, CV_8U))
{
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return;
}
buf.image_sqsums.resize(1);
sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
matchTemplate_CCORR_8U(image, templ, result, buf, stream);
matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
}
void matchTemplate_SQDIFF_NORMED_8U(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
buf.image_sqsums.resize(1);
sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
matchTemplate_CCORR_8U(image, templ, result, buf, stream);
matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
}
void matchTemplate_CCOFF_8U(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
matchTemplate_CCORR_8U(image, templ, result, buf, stream);
if (image.channels() == 1)
{
buf.image_sums.resize(1);
integral(image, buf.image_sums[0], stream);
unsigned int templ_sum = (unsigned int)sum(templ)[0];
matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, buf.image_sums[0], templ_sum, result, StreamAccessor::getStream(stream));
}
else
{
split(image, buf.images);
buf.image_sums.resize(buf.images.size());
for (int i = 0; i < image.channels(); ++i)
integral(buf.images[i], buf.image_sums[i], stream);
Scalar templ_sum = sum(templ);
switch (image.channels())
{
case 2:
matchTemplatePrepared_CCOFF_8UC2(
templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1],
(unsigned int)templ_sum[0], (unsigned int)templ_sum[1],
result, StreamAccessor::getStream(stream));
break;
case 3:
matchTemplatePrepared_CCOFF_8UC3(
templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2],
(unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
result, StreamAccessor::getStream(stream));
break;
case 4:
matchTemplatePrepared_CCOFF_8UC4(
templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2], buf.image_sums[3],
(unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
(unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream));
break;
default:
CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported number of channels");
}
}
}
void matchTemplate_CCOFF_NORMED_8U(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
if (stream)
{
stream.enqueueConvert(image, buf.imagef, CV_32F);
stream.enqueueConvert(templ, buf.templf, CV_32F);
}
else
{
image.convertTo(buf.imagef, CV_32F);
templ.convertTo(buf.templf, CV_32F);
}
matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
if (image.channels() == 1)
{
buf.image_sums.resize(1);
integral(image, buf.image_sums[0], stream);
buf.image_sqsums.resize(1);
sqrIntegral(image, buf.image_sqsums[0], stream);
unsigned int templ_sum = (unsigned int)sum(templ)[0];
unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ)[0];
matchTemplatePrepared_CCOFF_NORMED_8U(
templ.cols, templ.rows, buf.image_sums[0], buf.image_sqsums[0],
templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream));
}
else
{
split(image, buf.images);
buf.image_sums.resize(buf.images.size());
buf.image_sqsums.resize(buf.images.size());
for (int i = 0; i < image.channels(); ++i)
{
integral(buf.images[i], buf.image_sums[i], stream);
sqrIntegral(buf.images[i], buf.image_sqsums[i], stream);
}
Scalar templ_sum = sum(templ);
Scalar templ_sqsum = sqrSum(templ);
switch (image.channels())
{
case 2:
matchTemplatePrepared_CCOFF_NORMED_8UC2(
templ.cols, templ.rows,
buf.image_sums[0], buf.image_sqsums[0],
buf.image_sums[1], buf.image_sqsums[1],
(unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
result, StreamAccessor::getStream(stream));
break;
case 3:
matchTemplatePrepared_CCOFF_NORMED_8UC3(
templ.cols, templ.rows,
buf.image_sums[0], buf.image_sqsums[0],
buf.image_sums[1], buf.image_sqsums[1],
buf.image_sums[2], buf.image_sqsums[2],
(unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
(unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
result, StreamAccessor::getStream(stream));
break;
case 4:
matchTemplatePrepared_CCOFF_NORMED_8UC4(
templ.cols, templ.rows,
buf.image_sums[0], buf.image_sqsums[0],
buf.image_sums[1], buf.image_sqsums[1],
buf.image_sums[2], buf.image_sqsums[2],
buf.image_sums[3], buf.image_sqsums[3],
(unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
(unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
(unsigned int)templ_sum[3], (unsigned long long)templ_sqsum[3],
result, StreamAccessor::getStream(stream));
break;
default:
CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported number of channels");
}
}
}
}
void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream)
{
MatchTemplateBuf buf;
matchTemplate(image, templ, result, method, buf, stream);
}
void cv::gpu::matchTemplate(
const GpuMat& image, const GpuMat& templ, GpuMat& result, int method,
MatchTemplateBuf &buf, Stream& stream)
{
CV_Assert(image.type() == templ.type());
CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&, MatchTemplateBuf&, Stream& stream);
static const Caller callers8U[] = { ::matchTemplate_SQDIFF_8U, ::matchTemplate_SQDIFF_NORMED_8U,
::matchTemplate_CCORR_8U, ::matchTemplate_CCORR_NORMED_8U,
::matchTemplate_CCOFF_8U, ::matchTemplate_CCOFF_NORMED_8U };
static const Caller callers32F[] = { ::matchTemplate_SQDIFF_32F, 0,
::matchTemplate_CCORR_32F, 0, 0, 0 };
const Caller* callers = 0;
switch (image.depth())
{
case CV_8U: callers = callers8U; break;
case CV_32F: callers = callers32F; break;
default: CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported data type");
}
Caller caller = callers[method];
CV_Assert(caller);
caller(image, templ, result, buf, stream);
}
#endif
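
A minimal host-side sketch of how the matchTemplate entry point above can be called; the header path and the input file names (scene.png, patch.png) are assumed for illustration, using the 2.4-era cv::gpu API:

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp> // assumed pre-split header location

int main()
{
    // Hypothetical 8-bit single-channel inputs.
    cv::Mat h_image = cv::imread("scene.png", 0);
    cv::Mat h_templ = cv::imread("patch.png", 0);

    cv::gpu::GpuMat d_image(h_image), d_templ(h_templ), d_result;

    // CV_TM_CCOEFF_NORMED (method index 5) dispatches to matchTemplate_CCOFF_NORMED_8U above.
    cv::gpu::matchTemplate(d_image, d_templ, d_result, CV_TM_CCOEFF_NORMED);

    // Best match position = location of the maximum response.
    double maxVal;
    cv::Point maxLoc;
    cv::gpu::minMaxLoc(d_result, 0, &maxVal, 0, &maxLoc);
    return 0;
}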

View File

@@ -1,387 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::gpu::meanShiftSegmentation(const GpuMat&, Mat&, int, int, int, TermCriteria) { throw_no_cuda(); }
#else
// Auxiliary stuff
namespace
{
//
// Declarations
//
class DjSets
{
public:
DjSets(int n);
int find(int elem);
int merge(int set1, int set2);
std::vector<int> parent;
std::vector<int> rank;
std::vector<int> size;
private:
DjSets(const DjSets&);
void operator =(const DjSets&);
};
template <typename T>
struct GraphEdge
{
GraphEdge() {}
GraphEdge(int to_, int next_, const T& val_) : to(to_), next(next_), val(val_) {}
int to;
int next;
T val;
};
template <typename T>
class Graph
{
public:
typedef GraphEdge<T> Edge;
Graph(int numv, int nume_max);
void addEdge(int from, int to, const T& val=T());
std::vector<int> start;
std::vector<Edge> edges;
int numv;
int nume_max;
int nume;
private:
Graph(const Graph&);
void operator =(const Graph&);
};
struct SegmLinkVal
{
SegmLinkVal() {}
SegmLinkVal(int dr_, int dsp_) : dr(dr_), dsp(dsp_) {}
bool operator <(const SegmLinkVal& other) const
{
return dr + dsp < other.dr + other.dsp;
}
int dr;
int dsp;
};
struct SegmLink
{
SegmLink() {}
SegmLink(int from_, int to_, const SegmLinkVal& val_)
: from(from_), to(to_), val(val_) {}
bool operator <(const SegmLink& other) const
{
return val < other.val;
}
int from;
int to;
SegmLinkVal val;
};
//
// Implementation
//
DjSets::DjSets(int n) : parent(n), rank(n, 0), size(n, 1)
{
for (int i = 0; i < n; ++i)
parent[i] = i;
}
inline int DjSets::find(int elem)
{
int set = elem;
while (set != parent[set])
set = parent[set];
while (elem != parent[elem])
{
int next = parent[elem];
parent[elem] = set;
elem = next;
}
return set;
}
inline int DjSets::merge(int set1, int set2)
{
if (rank[set1] < rank[set2])
{
parent[set1] = set2;
size[set2] += size[set1];
return set2;
}
if (rank[set2] < rank[set1])
{
parent[set2] = set1;
size[set1] += size[set2];
return set1;
}
parent[set1] = set2;
rank[set2]++;
size[set2] += size[set1];
return set2;
}
template <typename T>
Graph<T>::Graph(int numv_, int nume_max_) : start(numv_, -1), edges(nume_max_)
{
this->numv = numv_;
this->nume_max = nume_max_;
nume = 0;
}
template <typename T>
inline void Graph<T>::addEdge(int from, int to, const T& val)
{
edges[nume] = Edge(to, start[from], val);
start[from] = nume;
nume++;
}
inline int pix(int y, int x, int ncols)
{
return y * ncols + x;
}
inline int sqr(int x)
{
return x * x;
}
inline int dist2(const cv::Vec4b& lhs, const cv::Vec4b& rhs)
{
return sqr(lhs[0] - rhs[0]) + sqr(lhs[1] - rhs[1]) + sqr(lhs[2] - rhs[2]);
}
inline int dist2(const cv::Vec2s& lhs, const cv::Vec2s& rhs)
{
return sqr(lhs[0] - rhs[0]) + sqr(lhs[1] - rhs[1]);
}
} // anonymous namespace
void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, TermCriteria criteria)
{
CV_Assert(src.type() == CV_8UC4);
const int nrows = src.rows;
const int ncols = src.cols;
const int hr = sr;
const int hsp = sp;
// Perform mean shift procedure and obtain region and spatial maps
GpuMat d_rmap, d_spmap;
meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria);
Mat rmap(d_rmap);
Mat spmap(d_spmap);
Graph<SegmLinkVal> g(nrows * ncols, 4 * (nrows - 1) * (ncols - 1)
+ (nrows - 1) + (ncols - 1));
// Build the region adjacency graph from the image
Vec4b r1;
Vec4b r2[4];
Vec2s sp1;
Vec2s sp2[4];
int dr[4];
int dsp[4];
for (int y = 0; y < nrows - 1; ++y)
{
Vec4b* ry = rmap.ptr<Vec4b>(y);
Vec4b* ryp = rmap.ptr<Vec4b>(y + 1);
Vec2s* spy = spmap.ptr<Vec2s>(y);
Vec2s* spyp = spmap.ptr<Vec2s>(y + 1);
for (int x = 0; x < ncols - 1; ++x)
{
r1 = ry[x];
sp1 = spy[x];
r2[0] = ry[x + 1];
r2[1] = ryp[x];
r2[2] = ryp[x + 1];
r2[3] = ryp[x];
sp2[0] = spy[x + 1];
sp2[1] = spyp[x];
sp2[2] = spyp[x + 1];
sp2[3] = spyp[x];
dr[0] = dist2(r1, r2[0]);
dr[1] = dist2(r1, r2[1]);
dr[2] = dist2(r1, r2[2]);
dsp[0] = dist2(sp1, sp2[0]);
dsp[1] = dist2(sp1, sp2[1]);
dsp[2] = dist2(sp1, sp2[2]);
r1 = ry[x + 1];
sp1 = spy[x + 1];
dr[3] = dist2(r1, r2[3]);
dsp[3] = dist2(sp1, sp2[3]);
g.addEdge(pix(y, x, ncols), pix(y, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
g.addEdge(pix(y, x, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[1], dsp[1]));
g.addEdge(pix(y, x, ncols), pix(y + 1, x + 1, ncols), SegmLinkVal(dr[2], dsp[2]));
g.addEdge(pix(y, x + 1, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[3], dsp[3]));
}
}
for (int y = 0; y < nrows - 1; ++y)
{
r1 = rmap.at<Vec4b>(y, ncols - 1);
r2[0] = rmap.at<Vec4b>(y + 1, ncols - 1);
sp1 = spmap.at<Vec2s>(y, ncols - 1);
sp2[0] = spmap.at<Vec2s>(y + 1, ncols - 1);
dr[0] = dist2(r1, r2[0]);
dsp[0] = dist2(sp1, sp2[0]);
g.addEdge(pix(y, ncols - 1, ncols), pix(y + 1, ncols - 1, ncols), SegmLinkVal(dr[0], dsp[0]));
}
for (int x = 0; x < ncols - 1; ++x)
{
r1 = rmap.at<Vec4b>(nrows - 1, x);
r2[0] = rmap.at<Vec4b>(nrows - 1, x + 1);
sp1 = spmap.at<Vec2s>(nrows - 1, x);
sp2[0] = spmap.at<Vec2s>(nrows - 1, x + 1);
dr[0] = dist2(r1, r2[0]);
dsp[0] = dist2(sp1, sp2[0]);
g.addEdge(pix(nrows - 1, x, ncols), pix(nrows - 1, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
}
DjSets comps(g.numv);
// Find adjacent components
for (int v = 0; v < g.numv; ++v)
{
for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
{
int c1 = comps.find(v);
int c2 = comps.find(g.edges[e_it].to);
if (c1 != c2 && g.edges[e_it].val.dr < hr && g.edges[e_it].val.dsp < hsp)
comps.merge(c1, c2);
}
}
std::vector<SegmLink> edges;
edges.reserve(g.numv);
// Prepare edges connecting different components
for (int v = 0; v < g.numv; ++v)
{
int c1 = comps.find(v);
for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
{
int c2 = comps.find(g.edges[e_it].to);
if (c1 != c2)
edges.push_back(SegmLink(c1, c2, g.edges[e_it].val));
}
}
// Sort all graph edges connecting different components (in ascending order)
sort(edges.begin(), edges.end());
// Exclude small components (starting from the closest pair)
for (size_t i = 0; i < edges.size(); ++i)
{
int c1 = comps.find(edges[i].from);
int c2 = comps.find(edges[i].to);
if (c1 != c2 && (comps.size[c1] < minsize || comps.size[c2] < minsize))
comps.merge(c1, c2);
}
// Compute the sum of the colors of the pixels belonging to each segment
Mat h_src(src);
std::vector<Vec4i> sumcols(nrows * ncols, Vec4i(0, 0, 0, 0));
for (int y = 0; y < nrows; ++y)
{
Vec4b* h_srcy = h_src.ptr<Vec4b>(y);
for (int x = 0; x < ncols; ++x)
{
int parent = comps.find(pix(y, x, ncols));
Vec4b col = h_srcy[x];
Vec4i& sumcol = sumcols[parent];
sumcol[0] += col[0];
sumcol[1] += col[1];
sumcol[2] += col[2];
}
}
// Create the final image; the color of each segment is the average color of its pixels
dst.create(src.size(), src.type());
for (int y = 0; y < nrows; ++y)
{
Vec4b* dsty = dst.ptr<Vec4b>(y);
for (int x = 0; x < ncols; ++x)
{
int parent = comps.find(pix(y, x, ncols));
const Vec4i& sumcol = sumcols[parent];
Vec4b& dstcol = dsty[x];
dstcol[0] = static_cast<uchar>(sumcol[0] / comps.size[parent]);
dstcol[1] = static_cast<uchar>(sumcol[1] / comps.size[parent]);
dstcol[2] = static_cast<uchar>(sumcol[2] / comps.size[parent]);
dstcol[3] = 255;
}
}
}
#endif // #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
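
A minimal sketch of calling meanShiftSegmentation above; the header path and input.png are assumed for illustration, and the input is converted to CV_8UC4 first because the function asserts that type:

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp> // assumed pre-split header location

int main()
{
    cv::Mat bgr = cv::imread("input.png"); // hypothetical input
    cv::Mat bgra;
    cv::cvtColor(bgr, bgra, CV_BGR2BGRA);  // meanShiftSegmentation requires CV_8UC4

    cv::gpu::GpuMat d_src(bgra);
    cv::Mat segmented;

    // sp = 20 (spatial window radius), sr = 20 (color window radius), minsize = 50 pixels.
    cv::gpu::meanShiftSegmentation(d_src, segmented, 20, 20, 50,
        cv::TermCriteria(cv::TermCriteria::MAX_ITER + cv::TermCriteria::EPS, 5, 1));
    return 0;
}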

View File

@@ -1,249 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::gpu::pyrDown(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::pyrUp(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::ImagePyramid::build(const GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::ImagePyramid::getLayer(GpuMat&, Size, Stream&) const { throw_no_cuda(); }
#else // HAVE_CUDA
//////////////////////////////////////////////////////////////////////////////
// pyrDown
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
}}}
void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
{
using namespace cv::gpu::cudev::imgproc;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{pyrDown_gpu<uchar> , 0 /*pyrDown_gpu<uchar2>*/ , pyrDown_gpu<uchar3> , pyrDown_gpu<uchar4> },
{0 /*pyrDown_gpu<schar>*/, 0 /*pyrDown_gpu<schar2>*/ , 0 /*pyrDown_gpu<schar3>*/, 0 /*pyrDown_gpu<schar4>*/},
{pyrDown_gpu<ushort> , 0 /*pyrDown_gpu<ushort2>*/, pyrDown_gpu<ushort3> , pyrDown_gpu<ushort4> },
{pyrDown_gpu<short> , 0 /*pyrDown_gpu<short2>*/ , pyrDown_gpu<short3> , pyrDown_gpu<short4> },
{0 /*pyrDown_gpu<int>*/ , 0 /*pyrDown_gpu<int2>*/ , 0 /*pyrDown_gpu<int3>*/ , 0 /*pyrDown_gpu<int4>*/ },
{pyrDown_gpu<float> , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3> , pyrDown_gpu<float4> }
};
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
func(src, dst, StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// pyrUp
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
}}}
void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
{
using namespace cv::gpu::cudev::imgproc;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{pyrUp_gpu<uchar> , 0 /*pyrUp_gpu<uchar2>*/ , pyrUp_gpu<uchar3> , pyrUp_gpu<uchar4> },
{0 /*pyrUp_gpu<schar>*/, 0 /*pyrUp_gpu<schar2>*/ , 0 /*pyrUp_gpu<schar3>*/, 0 /*pyrUp_gpu<schar4>*/},
{pyrUp_gpu<ushort> , 0 /*pyrUp_gpu<ushort2>*/, pyrUp_gpu<ushort3> , pyrUp_gpu<ushort4> },
{pyrUp_gpu<short> , 0 /*pyrUp_gpu<short2>*/ , pyrUp_gpu<short3> , pyrUp_gpu<short4> },
{0 /*pyrUp_gpu<int>*/ , 0 /*pyrUp_gpu<int2>*/ , 0 /*pyrUp_gpu<int3>*/ , 0 /*pyrUp_gpu<int4>*/ },
{pyrUp_gpu<float> , 0 /*pyrUp_gpu<float2>*/ , pyrUp_gpu<float3> , pyrUp_gpu<float4> }
};
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
dst.create(src.rows * 2, src.cols * 2, src.type());
func(src, dst, StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// ImagePyramid
namespace cv { namespace gpu { namespace cudev
{
namespace pyramid
{
template <typename T> void kernelDownsampleX2_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void kernelInterpolateFrom1_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
}}}
void cv::gpu::ImagePyramid::build(const GpuMat& img, int numLayers, Stream& stream)
{
using namespace cv::gpu::cudev::pyramid;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{kernelDownsampleX2_gpu<uchar1> , 0 /*kernelDownsampleX2_gpu<uchar2>*/ , kernelDownsampleX2_gpu<uchar3> , kernelDownsampleX2_gpu<uchar4> },
{0 /*kernelDownsampleX2_gpu<char1>*/ , 0 /*kernelDownsampleX2_gpu<char2>*/ , 0 /*kernelDownsampleX2_gpu<char3>*/ , 0 /*kernelDownsampleX2_gpu<char4>*/ },
{kernelDownsampleX2_gpu<ushort1> , 0 /*kernelDownsampleX2_gpu<ushort2>*/, kernelDownsampleX2_gpu<ushort3> , kernelDownsampleX2_gpu<ushort4> },
{0 /*kernelDownsampleX2_gpu<short1>*/ , 0 /*kernelDownsampleX2_gpu<short2>*/ , 0 /*kernelDownsampleX2_gpu<short3>*/, 0 /*kernelDownsampleX2_gpu<short4>*/},
{0 /*kernelDownsampleX2_gpu<int1>*/ , 0 /*kernelDownsampleX2_gpu<int2>*/ , 0 /*kernelDownsampleX2_gpu<int3>*/ , 0 /*kernelDownsampleX2_gpu<int4>*/ },
{kernelDownsampleX2_gpu<float1> , 0 /*kernelDownsampleX2_gpu<float2>*/ , kernelDownsampleX2_gpu<float3> , kernelDownsampleX2_gpu<float4> }
};
CV_Assert(img.depth() <= CV_32F && img.channels() <= 4);
const func_t func = funcs[img.depth()][img.channels() - 1];
CV_Assert(func != 0);
layer0_ = img;
Size szLastLayer = img.size();
nLayers_ = 1;
if (numLayers <= 0)
numLayers = 255; // the loop below cuts off once either dimension reaches 1
pyramid_.resize(numLayers);
for (int i = 0; i < numLayers - 1; ++i)
{
Size szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
if (szCurLayer.width == 0 || szCurLayer.height == 0)
break;
ensureSizeIsEnough(szCurLayer, img.type(), pyramid_[i]);
nLayers_++;
const GpuMat& prevLayer = i == 0 ? layer0_ : pyramid_[i - 1];
func(prevLayer, pyramid_[i], StreamAccessor::getStream(stream));
szLastLayer = szCurLayer;
}
}
void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream) const
{
using namespace cv::gpu::cudev::pyramid;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{kernelInterpolateFrom1_gpu<uchar1> , 0 /*kernelInterpolateFrom1_gpu<uchar2>*/ , kernelInterpolateFrom1_gpu<uchar3> , kernelInterpolateFrom1_gpu<uchar4> },
{0 /*kernelInterpolateFrom1_gpu<char1>*/ , 0 /*kernelInterpolateFrom1_gpu<char2>*/ , 0 /*kernelInterpolateFrom1_gpu<char3>*/ , 0 /*kernelInterpolateFrom1_gpu<char4>*/ },
{kernelInterpolateFrom1_gpu<ushort1> , 0 /*kernelInterpolateFrom1_gpu<ushort2>*/, kernelInterpolateFrom1_gpu<ushort3> , kernelInterpolateFrom1_gpu<ushort4> },
{0 /*kernelInterpolateFrom1_gpu<short1>*/, 0 /*kernelInterpolateFrom1_gpu<short2>*/ , 0 /*kernelInterpolateFrom1_gpu<short3>*/, 0 /*kernelInterpolateFrom1_gpu<short4>*/},
{0 /*kernelInterpolateFrom1_gpu<int1>*/ , 0 /*kernelInterpolateFrom1_gpu<int2>*/ , 0 /*kernelInterpolateFrom1_gpu<int3>*/ , 0 /*kernelInterpolateFrom1_gpu<int4>*/ },
{kernelInterpolateFrom1_gpu<float1> , 0 /*kernelInterpolateFrom1_gpu<float2>*/ , kernelInterpolateFrom1_gpu<float3> , kernelInterpolateFrom1_gpu<float4> }
};
CV_Assert(outRoi.width <= layer0_.cols && outRoi.height <= layer0_.rows && outRoi.width > 0 && outRoi.height > 0);
ensureSizeIsEnough(outRoi, layer0_.type(), outImg);
const func_t func = funcs[outImg.depth()][outImg.channels() - 1];
CV_Assert(func != 0);
if (outRoi.width == layer0_.cols && outRoi.height == layer0_.rows)
{
if (stream)
stream.enqueueCopy(layer0_, outImg);
else
layer0_.copyTo(outImg);
return;
}
float lastScale = 1.0f;
float curScale;
GpuMat lastLayer = layer0_;
GpuMat curLayer;
for (int i = 0; i < nLayers_ - 1; ++i)
{
curScale = lastScale * 0.5f;
curLayer = pyramid_[i];
if (outRoi.width == curLayer.cols && outRoi.height == curLayer.rows)
{
if (stream)
stream.enqueueCopy(curLayer, outImg);
else
curLayer.copyTo(outImg);
return;
}
if (outRoi.width >= curLayer.cols && outRoi.height >= curLayer.rows)
break;
lastScale = curScale;
lastLayer = curLayer;
}
func(lastLayer, outImg, StreamAccessor::getStream(stream));
}
#endif // HAVE_CUDA
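
A minimal sketch of the pyrDown/pyrUp and ImagePyramid interfaces implemented above; the header path, input.png, and a default-constructible ImagePyramid are assumptions taken from the 2.4-era cv::gpu API:

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp> // assumed pre-split header location

int main()
{
    cv::gpu::GpuMat d_src(cv::imread("input.png")); // hypothetical CV_8UC3 input
    cv::gpu::GpuMat d_half, d_restored;

    cv::gpu::pyrDown(d_src, d_half);     // (rows + 1) / 2 x (cols + 1) / 2
    cv::gpu::pyrUp(d_half, d_restored);  // 2 * rows x 2 * cols

    // Multi-layer pyramid with read-back of an arbitrary layer size.
    cv::gpu::ImagePyramid pyr;           // assumed default-constructible, as in the 2.4 gpu headers
    pyr.build(d_src, 4);
    cv::gpu::GpuMat d_layer;
    pyr.getLayer(d_layer, cv::Size(d_src.cols / 3, d_src.rows / 3));
    return 0;
}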

View File

@@ -1,102 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&, int, int, Scalar, Stream&){ throw_no_cuda(); }
#else // HAVE_CUDA
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename T>
void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst,
int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
}
}}}
void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode, Scalar borderValue, Stream& stream)
{
using namespace cv::gpu::cudev::imgproc;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
static const func_t funcs[6][4] =
{
{remap_gpu<uchar> , 0 /*remap_gpu<uchar2>*/ , remap_gpu<uchar3> , remap_gpu<uchar4> },
{0 /*remap_gpu<schar>*/, 0 /*remap_gpu<char2>*/ , 0 /*remap_gpu<char3>*/, 0 /*remap_gpu<char4>*/},
{remap_gpu<ushort> , 0 /*remap_gpu<ushort2>*/, remap_gpu<ushort3> , remap_gpu<ushort4> },
{remap_gpu<short> , 0 /*remap_gpu<short2>*/ , remap_gpu<short3> , remap_gpu<short4> },
{0 /*remap_gpu<int>*/ , 0 /*remap_gpu<int2>*/ , 0 /*remap_gpu<int3>*/ , 0 /*remap_gpu<int4>*/ },
{remap_gpu<float> , 0 /*remap_gpu<float2>*/ , remap_gpu<float3> , remap_gpu<float4> }
};
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size());
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
dst.create(xmap.size(), src.type());
Scalar_<float> borderValueFloat;
borderValueFloat = borderValue;
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, xmap, ymap,
dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(stream), deviceSupports(FEATURE_SET_COMPUTE_20));
}
#endif // HAVE_CUDA
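
A minimal sketch of calling remap above with CV_32FC1 maps that mirror the image horizontally; the header path and input.png are assumed for illustration:

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp> // assumed pre-split header location

int main()
{
    cv::Mat src = cv::imread("input.png"); // hypothetical input
    cv::gpu::GpuMat d_src(src), d_dst;

    // Per-pixel source coordinates: dst(y, x) = src(ymap(y, x), xmap(y, x)).
    cv::Mat xmap(src.size(), CV_32FC1), ymap(src.size(), CV_32FC1);
    for (int y = 0; y < src.rows; ++y)
    {
        for (int x = 0; x < src.cols; ++x)
        {
            xmap.at<float>(y, x) = static_cast<float>(src.cols - 1 - x);
            ymap.at<float>(y, x) = static_cast<float>(y);
        }
    }

    cv::gpu::GpuMat d_xmap(xmap), d_ymap(ymap);
    cv::gpu::remap(d_src, d_dst, d_xmap, d_ymap, cv::INTER_LINEAR, cv::BORDER_CONSTANT);
    return 0;
}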

View File

@@ -1,162 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
(void)src;
(void)dst;
(void)dsize;
(void)fx;
(void)fy;
(void)interpolation;
(void)s;
throw_no_cuda();
}
#else // HAVE_CUDA
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename T>
void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
PtrStepSzb dst, int interpolation, cudaStream_t stream);
}
}}}
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR
|| interpolation == INTER_CUBIC || interpolation == INTER_AREA);
CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));
if (dsize == Size())
dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
else
{
fx = static_cast<double>(dsize.width) / src.cols;
fy = static_cast<double>(dsize.height) / src.rows;
}
if (dsize != dst.size())
dst.create(dsize, src.type());
if (dsize == src.size())
{
if (s)
s.enqueueCopy(src, dst);
else
src.copyTo(dst);
return;
}
cudaStream_t stream = StreamAccessor::getStream(s);
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
bool useNpp = (src.type() == CV_8UC1 || src.type() == CV_8UC4);
useNpp = useNpp && (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR);
if (useNpp)
{
typedef NppStatus (*func_t)(const Npp8u * pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI, Npp8u * pDst, int nDstStep, NppiSize dstROISize,
double xFactor, double yFactor, int eInterpolation);
const func_t funcs[4] = { nppiResize_8u_C1R, 0, 0, nppiResize_8u_C4R };
static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC, 0, NPPI_INTER_LANCZOS};
NppiSize srcsz;
srcsz.width = wholeSize.width;
srcsz.height = wholeSize.height;
NppiRect srcrect;
srcrect.x = ofs.x;
srcrect.y = ofs.y;
srcrect.width = src.cols;
srcrect.height = src.rows;
NppiSize dstsz;
dstsz.width = dst.cols;
dstsz.height = dst.rows;
NppStreamHandler h(stream);
nppSafeCall( funcs[src.channels() - 1](src.datastart, srcsz, static_cast<int>(src.step), srcrect,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, fx, fy, npp_inter[interpolation]) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
else
{
using namespace ::cv::gpu::cudev::imgproc;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{resize_gpu<uchar> , 0 /*resize_gpu<uchar2>*/ , resize_gpu<uchar3> , resize_gpu<uchar4> },
{0 /*resize_gpu<schar>*/, 0 /*resize_gpu<char2>*/ , 0 /*resize_gpu<char3>*/, 0 /*resize_gpu<char4>*/},
{resize_gpu<ushort> , 0 /*resize_gpu<ushort2>*/, resize_gpu<ushort3> , resize_gpu<ushort4> },
{resize_gpu<short> , 0 /*resize_gpu<short2>*/ , resize_gpu<short3> , resize_gpu<short4> },
{0 /*resize_gpu<int>*/ , 0 /*resize_gpu<int2>*/ , 0 /*resize_gpu<int3>*/ , 0 /*resize_gpu<int4>*/ },
{resize_gpu<float> , 0 /*resize_gpu<float2>*/ , resize_gpu<float3> , resize_gpu<float4> }
};
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y,
static_cast<float>(1.0 / fx), static_cast<float>(1.0 / fy), dst, interpolation, stream);
}
}
#endif // HAVE_CUDA
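
A minimal sketch of calling resize above; the header path and input.png are assumed for illustration. An 8UC1 or 8UC4 source with nearest or linear interpolation takes the NPP branch, everything else takes the CUDA kernel branch:

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp> // assumed pre-split header location

int main()
{
    cv::gpu::GpuMat d_src(cv::imread("input.png")); // hypothetical CV_8UC3 input
    cv::gpu::GpuMat d_dst;

    // Halve both dimensions; dsize is derived from fx/fy when left empty.
    cv::gpu::resize(d_src, d_dst, cv::Size(), 0.5, 0.5, cv::INTER_LINEAR);
    return 0;
}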

View File

@@ -1,454 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::gpu::warpAffine(const GpuMat&, GpuMat&, const Mat&, Size, int, int, Scalar, Stream&) { throw_no_cuda(); }
void cv::gpu::buildWarpAffineMaps(const Mat&, bool, Size, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::warpPerspective(const GpuMat&, GpuMat&, const Mat&, Size, int, int, Scalar, Stream&) { throw_no_cuda(); }
void cv::gpu::buildWarpPerspectiveMaps(const Mat&, bool, Size, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
#else // HAVE_CUDA
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream);
template <typename T>
void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream);
template <typename T>
void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
}
}}}
void cv::gpu::buildWarpAffineMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream)
{
using namespace cv::gpu::cudev::imgproc;
CV_Assert(M.rows == 2 && M.cols == 3);
xmap.create(dsize, CV_32FC1);
ymap.create(dsize, CV_32FC1);
float coeffs[2 * 3];
Mat coeffsMat(2, 3, CV_32F, (void*)coeffs);
if (inverse)
M.convertTo(coeffsMat, coeffsMat.type());
else
{
cv::Mat iM;
invertAffineTransform(M, iM);
iM.convertTo(coeffsMat, coeffsMat.type());
}
buildWarpAffineMaps_gpu(coeffs, xmap, ymap, StreamAccessor::getStream(stream));
}
void cv::gpu::buildWarpPerspectiveMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream)
{
using namespace cv::gpu::cudev::imgproc;
CV_Assert(M.rows == 3 && M.cols == 3);
xmap.create(dsize, CV_32FC1);
ymap.create(dsize, CV_32FC1);
float coeffs[3 * 3];
Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);
if (inverse)
M.convertTo(coeffsMat, coeffsMat.type());
else
{
cv::Mat iM;
invert(M, iM);
iM.convertTo(coeffsMat, coeffsMat.type());
}
buildWarpPerspectiveMaps_gpu(coeffs, xmap, ymap, StreamAccessor::getStream(stream));
}
namespace
{
template<int DEPTH> struct NppTypeTraits;
template<> struct NppTypeTraits<CV_8U> { typedef Npp8u npp_t; };
template<> struct NppTypeTraits<CV_8S> { typedef Npp8s npp_t; };
template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; typedef Npp16sc npp_complex_type; };
template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; typedef Npp32sc npp_complex_type; };
template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; };
template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; };
template <int DEPTH> struct NppWarpFunc
{
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
typedef NppStatus (*func_t)(const npp_t* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, npp_t* pDst,
int dstStep, NppiRect dstRoi, const double coeffs[][3],
int interpolation);
};
template <int DEPTH, typename NppWarpFunc<DEPTH>::func_t func> struct NppWarp
{
typedef typename NppWarpFunc<DEPTH>::npp_t npp_t;
static void call(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream)
{
static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
NppiSize srcsz;
srcsz.height = src.rows;
srcsz.width = src.cols;
NppiRect srcroi;
srcroi.x = 0;
srcroi.y = 0;
srcroi.height = src.rows;
srcroi.width = src.cols;
NppiRect dstroi;
dstroi.x = 0;
dstroi.y = 0;
dstroi.height = dst.rows;
dstroi.width = dst.cols;
cv::gpu::NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<npp_t>(), srcsz, static_cast<int>(src.step), srcroi,
dst.ptr<npp_t>(), static_cast<int>(dst.step), dstroi,
coeffs, npp_inter[interpolation]) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
}
void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& s)
{
CV_Assert(M.rows == 2 && M.cols == 3);
int interpolation = flags & INTER_MAX;
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
dst.create(dsize, src.type());
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
static const bool useNppTab[6][4][3] =
{
{
{false, false, true},
{false, false, false},
{false, true, true},
{false, false, false}
},
{
{false, false, false},
{false, false, false},
{false, false, false},
{false, false, false}
},
{
{false, true, true},
{false, false, false},
{false, true, true},
{false, false, false}
},
{
{false, false, false},
{false, false, false},
{false, false, false},
{false, false, false}
},
{
{false, true, true},
{false, false, false},
{false, true, true},
{false, false, true}
},
{
{false, true, true},
{false, false, false},
{false, true, true},
{false, false, true}
}
};
bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
// NPP bug on float data
useNpp = useNpp && src.depth() != CV_32F;
if (useNpp)
{
typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);
static const func_t funcs[2][6][4] =
{
{
{NppWarp<CV_8U, nppiWarpAffine_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffine_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffine_8u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_16U, nppiWarpAffine_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffine_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffine_16u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_32S, nppiWarpAffine_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffine_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffine_32s_C4R>::call},
{NppWarp<CV_32F, nppiWarpAffine_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffine_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffine_32f_C4R>::call}
},
{
{NppWarp<CV_8U, nppiWarpAffineBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffineBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffineBack_8u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_16U, nppiWarpAffineBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffineBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffineBack_16u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_32S, nppiWarpAffineBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffineBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffineBack_32s_C4R>::call},
{NppWarp<CV_32F, nppiWarpAffineBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffineBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffineBack_32f_C4R>::call}
}
};
dst.setTo(borderValue);
double coeffs[2][3];
Mat coeffsMat(2, 3, CV_64F, (void*)coeffs);
M.convertTo(coeffsMat, coeffsMat.type());
const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
CV_Assert(func != 0);
func(src, dst, coeffs, interpolation, StreamAccessor::getStream(s));
}
else
{
using namespace cv::gpu::cudev::imgproc;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
static const func_t funcs[6][4] =
{
{warpAffine_gpu<uchar> , 0 /*warpAffine_gpu<uchar2>*/ , warpAffine_gpu<uchar3> , warpAffine_gpu<uchar4> },
{0 /*warpAffine_gpu<schar>*/, 0 /*warpAffine_gpu<char2>*/ , 0 /*warpAffine_gpu<char3>*/, 0 /*warpAffine_gpu<char4>*/},
{warpAffine_gpu<ushort> , 0 /*warpAffine_gpu<ushort2>*/, warpAffine_gpu<ushort3> , warpAffine_gpu<ushort4> },
{warpAffine_gpu<short> , 0 /*warpAffine_gpu<short2>*/ , warpAffine_gpu<short3> , warpAffine_gpu<short4> },
{0 /*warpAffine_gpu<int>*/ , 0 /*warpAffine_gpu<int2>*/ , 0 /*warpAffine_gpu<int3>*/ , 0 /*warpAffine_gpu<int4>*/ },
{warpAffine_gpu<float> , 0 /*warpAffine_gpu<float2>*/ , warpAffine_gpu<float3> , warpAffine_gpu<float4> }
};
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
float coeffs[2 * 3];
Mat coeffsMat(2, 3, CV_32F, (void*)coeffs);
if (flags & WARP_INVERSE_MAP)
M.convertTo(coeffsMat, coeffsMat.type());
else
{
cv::Mat iM;
invertAffineTransform(M, iM);
iM.convertTo(coeffsMat, coeffsMat.type());
}
Scalar_<float> borderValueFloat;
borderValueFloat = borderValue;
func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), deviceSupports(FEATURE_SET_COMPUTE_20));
}
}
void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& s)
{
CV_Assert(M.rows == 3 && M.cols == 3);
int interpolation = flags & INTER_MAX;
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
dst.create(dsize, src.type());
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
static const bool useNppTab[6][4][3] =
{
{
{false, false, true},
{false, false, false},
{false, true, true},
{false, false, false}
},
{
{false, false, false},
{false, false, false},
{false, false, false},
{false, false, false}
},
{
{false, true, true},
{false, false, false},
{false, true, true},
{false, false, false}
},
{
{false, false, false},
{false, false, false},
{false, false, false},
{false, false, false}
},
{
{false, true, true},
{false, false, false},
{false, true, true},
{false, false, true}
},
{
{false, true, true},
{false, false, false},
{false, true, true},
{false, false, true}
}
};
bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
// NPP bug on float data
useNpp = useNpp && src.depth() != CV_32F;
if (useNpp)
{
typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);
static const func_t funcs[2][6][4] =
{
{
{NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call},
{NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call}
},
{
{NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call},
{NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call}
}
};
dst.setTo(borderValue);
double coeffs[3][3];
Mat coeffsMat(3, 3, CV_64F, (void*)coeffs);
M.convertTo(coeffsMat, coeffsMat.type());
const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
CV_Assert(func != 0);
func(src, dst, coeffs, interpolation, StreamAccessor::getStream(s));
}
else
{
using namespace cv::gpu::cudev::imgproc;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
static const func_t funcs[6][4] =
{
{warpPerspective_gpu<uchar> , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3> , warpPerspective_gpu<uchar4> },
{0 /*warpPerspective_gpu<schar>*/, 0 /*warpPerspective_gpu<char2>*/ , 0 /*warpPerspective_gpu<char3>*/, 0 /*warpPerspective_gpu<char4>*/},
{warpPerspective_gpu<ushort> , 0 /*warpPerspective_gpu<ushort2>*/, warpPerspective_gpu<ushort3> , warpPerspective_gpu<ushort4> },
{warpPerspective_gpu<short> , 0 /*warpPerspective_gpu<short2>*/ , warpPerspective_gpu<short3> , warpPerspective_gpu<short4> },
{0 /*warpPerspective_gpu<int>*/ , 0 /*warpPerspective_gpu<int2>*/ , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ },
{warpPerspective_gpu<float> , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3> , warpPerspective_gpu<float4> }
};
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
float coeffs[3 * 3];
Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);
if (flags & WARP_INVERSE_MAP)
M.convertTo(coeffsMat, coeffsMat.type());
else
{
cv::Mat iM;
invert(M, iM);
iM.convertTo(coeffsMat, coeffsMat.type());
}
Scalar_<float> borderValueFloat;
borderValueFloat = borderValue;
func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), deviceSupports(FEATURE_SET_COMPUTE_20));
}
}
#endif // HAVE_CUDA
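
A minimal sketch of calling warpAffine and warpPerspective above; the header path and input.png are assumed for illustration, the affine matrix comes from getRotationMatrix2D, and the perspective matrix is just an identity placeholder:

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp> // assumed pre-split header location

int main()
{
    cv::Mat src = cv::imread("input.png"); // hypothetical input
    cv::gpu::GpuMat d_src(src), d_dst;

    // 2x3 affine matrix: rotate 30 degrees around the image center.
    cv::Mat M = cv::getRotationMatrix2D(
        cv::Point2f(src.cols / 2.0f, src.rows / 2.0f), 30.0, 1.0);
    cv::gpu::warpAffine(d_src, d_dst, M, d_src.size(),
                        cv::INTER_LINEAR, cv::BORDER_CONSTANT);

    // warpPerspective takes a 3x3 matrix instead; identity leaves the image unchanged.
    cv::Mat H = cv::Mat::eye(3, 3, CV_64F);
    cv::gpu::warpPerspective(d_src, d_dst, H, d_src.size(),
                             cv::INTER_LINEAR, cv::BORDER_CONSTANT);
    return 0;
}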