Revert "Revert "Merge pull request #836 from jet47:gpu-modules""
199
modules/gpuimgproc/src/cuda/bilateral_filter.cu
Normal file
@@ -0,0 +1,199 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"

using namespace cv::gpu;

typedef unsigned char uchar;
typedef unsigned short ushort;

//////////////////////////////////////////////////////////////////////////////////
/// Bilateral filtering

namespace cv { namespace gpu { namespace cudev
{
    namespace imgproc
    {
        __device__ __forceinline__ float norm_l1(const float& a)  { return ::fabs(a); }
        __device__ __forceinline__ float norm_l1(const float2& a) { return ::fabs(a.x) + ::fabs(a.y); }
        __device__ __forceinline__ float norm_l1(const float3& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z); }
        __device__ __forceinline__ float norm_l1(const float4& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z) + ::fabs(a.w); }

        __device__ __forceinline__ float sqr(const float& a) { return a * a; }

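        // Brute-force bilateral filter: each thread computes one output pixel as a
        // weighted average over a ksz x ksz window clipped to a circular support.
        // The weight combines a spatial Gaussian (distance from the center pixel)
        // and a range Gaussian (L1 distance from the center value); both exponents
        // arrive precomputed as -0.5/sigma^2 factors. Interior pixels read src
        // directly; pixels whose window leaves the image go through B::at().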
        template<typename T, typename B>
        __global__ void bilateral_kernel(const PtrStepSz<T> src, PtrStep<T> dst, const B b, const int ksz, const float sigma_spatial2_inv_half, const float sigma_color2_inv_half)
        {
            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;

            int x = threadIdx.x + blockIdx.x * blockDim.x;
            int y = threadIdx.y + blockIdx.y * blockDim.y;

            if (x >= src.cols || y >= src.rows)
                return;

            value_type center = saturate_cast<value_type>(src(y, x));

            value_type sum1 = VecTraits<value_type>::all(0);
            float sum2 = 0;

            int r = ksz / 2;
            float r2 = (float)(r * r);

            int tx = x - r + ksz;
            int ty = y - r + ksz;

            if (x - ksz/2 >= 0 && y - ksz/2 >= 0 && tx < src.cols && ty < src.rows)
            {
                for (int cy = y - r; cy < ty; ++cy)
                    for (int cx = x - r; cx < tx; ++cx)
                    {
                        float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
                        if (space2 > r2)
                            continue;

                        value_type value = saturate_cast<value_type>(src(cy, cx));

                        float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
                        sum1 = sum1 + weight * value;
                        sum2 = sum2 + weight;
                    }
            }
            else
            {
                for (int cy = y - r; cy < ty; ++cy)
                    for (int cx = x - r; cx < tx; ++cx)
                    {
                        float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
                        if (space2 > r2)
                            continue;

                        value_type value = saturate_cast<value_type>(b.at(cy, cx, src.data, src.step));

                        float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);

                        sum1 = sum1 + weight * value;
                        sum2 = sum2 + weight;
                    }
            }
            dst(y, x) = saturate_cast<T>(sum1 / sum2);
        }

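        // Host-side launcher: one 32x8 thread block per output tile. The Gaussian
        // exponents are folded into -0.5/sigma^2 constants once on the host so the
        // kernel needs only a multiply before ::exp(). L1 cache is preferred over
        // shared memory because the kernel keeps no shared state.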
        template<typename T, template <typename> class B>
        void bilateral_caller(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream)
        {
            dim3 block (32, 8);
            dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));

            B<T> b(src.rows, src.cols);

            float sigma_spatial2_inv_half = -0.5f / (sigma_spatial * sigma_spatial);
            float sigma_color2_inv_half = -0.5f / (sigma_color * sigma_color);

            cudaSafeCall( cudaFuncSetCacheConfig (bilateral_kernel<T, B<T> >, cudaFuncCachePreferL1) );
            // asynchronous when a non-default stream is supplied
            bilateral_kernel<<<grid, block, 0, stream>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, kernel_size, sigma_spatial2_inv_half, sigma_color2_inv_half);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template<typename T>
        void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float gauss_spatial_coeff, float gauss_color_coeff, int borderMode, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream);

            static caller_t funcs[] =
            {
                bilateral_caller<T, BrdConstant>,
                bilateral_caller<T, BrdReplicate>,
                bilateral_caller<T, BrdReflect>,
                bilateral_caller<T, BrdWrap>,
                bilateral_caller<T, BrdReflect101>
            };
            funcs[borderMode](src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
        }
    }
}}}


#define OCV_INSTANTIATE_BILATERAL_FILTER(T) \
    template void cv::gpu::cudev::imgproc::bilateral_filter_gpu<T>(const PtrStepSzb&, PtrStepSzb, int, float, float, int, cudaStream_t);

OCV_INSTANTIATE_BILATERAL_FILTER(uchar)
//OCV_INSTANTIATE_BILATERAL_FILTER(uchar2)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar3)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar4)

//OCV_INSTANTIATE_BILATERAL_FILTER(schar)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar2)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar3)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar4)

OCV_INSTANTIATE_BILATERAL_FILTER(short)
//OCV_INSTANTIATE_BILATERAL_FILTER(short2)
OCV_INSTANTIATE_BILATERAL_FILTER(short3)
OCV_INSTANTIATE_BILATERAL_FILTER(short4)

OCV_INSTANTIATE_BILATERAL_FILTER(ushort)
//OCV_INSTANTIATE_BILATERAL_FILTER(ushort2)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort3)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort4)

//OCV_INSTANTIATE_BILATERAL_FILTER(int)
//OCV_INSTANTIATE_BILATERAL_FILTER(int2)
//OCV_INSTANTIATE_BILATERAL_FILTER(int3)
//OCV_INSTANTIATE_BILATERAL_FILTER(int4)

OCV_INSTANTIATE_BILATERAL_FILTER(float)
//OCV_INSTANTIATE_BILATERAL_FILTER(float2)
OCV_INSTANTIATE_BILATERAL_FILTER(float3)
OCV_INSTANTIATE_BILATERAL_FILTER(float4)


#endif /* CUDA_DISABLER */
121
modules/gpuimgproc/src/cuda/blend.cu
Normal file
@@ -0,0 +1,121 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"

namespace cv { namespace gpu { namespace cudev
{
    namespace blend
    {
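        // Linear blend of two images with per-pixel weight maps:
        // result = (w1 * img1 + w2 * img2) / (w1 + w2). The caller passes
        // cols * cn as the column count, so x indexes individual channels and
        // x / cn recovers the pixel column for the single-channel weight maps.
        // The 1e-5f term guards against division by zero where both weights vanish.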
        template <typename T>
        __global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
                                          const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
        {
            int x = blockIdx.x * blockDim.x + threadIdx.x;
            int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y < rows && x < cols)
            {
                int x_ = x / cn;
                float w1 = weights1.ptr(y)[x_];
                float w2 = weights2.ptr(y)[x_];
                T p1 = img1.ptr(y)[x];
                T p2 = img2.ptr(y)[x];
                result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
            }
        }

        template <typename T>
        void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
        {
            dim3 threads(16, 16);
            dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));

            blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
        template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);


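        // Specialized 8UC4 path: loads all four channels as a single uchar4, so
        // each thread blends a whole pixel and the weight lookup and normalization
        // are done once per pixel instead of once per channel.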
        __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
                                              const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
        {
            int x = blockIdx.x * blockDim.x + threadIdx.x;
            int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y < rows && x < cols)
            {
                float w1 = weights1.ptr(y)[x];
                float w2 = weights2.ptr(y)[x];
                float sum_inv = 1.f / (w1 + w2 + 1e-5f);
                w1 *= sum_inv;
                w2 *= sum_inv;
                uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];
                uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];
                ((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
                                                          p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
            }
        }

        void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
        {
            dim3 threads(16, 16);
            dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

            blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    } // namespace blend
}}} // namespace cv { namespace gpu { namespace cudev


#endif /* CUDA_DISABLER */
494
modules/gpuimgproc/src/cuda/canny.cu
Normal file
@@ -0,0 +1,494 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include <utility>
#include <algorithm> // std::swap
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"

using namespace cv::gpu;
using namespace cv::gpu::cudev;

namespace canny
{
    struct L1 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::abs(x) + ::abs(y);
        }

        __host__ __device__ __forceinline__ L1() {}
        __host__ __device__ __forceinline__ L1(const L1&) {}
    };
    struct L2 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::sqrtf(x * x + y * y);
        }

        __host__ __device__ __forceinline__ L2() {}
        __host__ __device__ __forceinline__ L2(const L2&) {}
    };
}

namespace cv { namespace gpu { namespace cudev
{
    template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
    {
        enum { smart_shift = 4 };
    };
    template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
    {
        enum { smart_shift = 4 };
    };
}}}

namespace canny
{
    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
    struct SrcTex
    {
        int xoff;
        int yoff;
        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}

        __device__ __forceinline__ int operator ()(int y, int x) const
        {
            return tex2D(tex_src, x + xoff, y + yoff);
        }
    };

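    // Computes the 3x3 Sobel derivatives and the gradient magnitude in one pass.
    // The source is read through tex_src (a clamped 2D texture), so out-of-range
    // accesses at the image border are handled by the texture addressing mode.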
    template <class Norm> __global__
    void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (y >= mag.rows || x >= mag.cols)
            return;

        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));

        dx(y, x) = dxVal;
        dy(y, x) = dyVal;

        mag(y, x) = norm(dxVal, dyVal);
    }

    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));

        bindTexture(&tex_src, srcWhole);
        SrcTex src(xoff, yoff);

        if (L2Grad)
        {
            L2 norm;
            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
        }
        else
        {
            L1 norm;
            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
        }

        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }

    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
    {
        if (L2Grad)
        {
            L2 norm;
            transform(dx, dy, mag, norm, WithOutMask(), 0);
        }
        else
        {
            L1 norm;
            transform(dx, dy, mag, norm, WithOutMask(), 0);
        }
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);

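    // Non-maximum suppression: classifies each pixel as 0 (not an edge),
    // 1 (weak candidate) or 2 (strong edge). The gradient direction is bucketed
    // without floating point by comparing |dy| against |dx| * tan(22.5 deg) and
    // |dx| * tan(67.5 deg) in 15-bit fixed point (CANNY_SHIFT / TG22).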
    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
    {
        const int CANNY_SHIFT = 15;
        const int TG22 = (int)(0.4142135623730950488016887242097 * (1 << CANNY_SHIFT) + 0.5);

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
            return;

        int dxVal = dx(y, x);
        int dyVal = dy(y, x);

        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
        const float m = tex2D(tex_mag, x, y);

        dxVal = ::abs(dxVal);
        dyVal = ::abs(dyVal);

        // 0 - the pixel can not belong to an edge
        // 1 - the pixel might belong to an edge
        // 2 - the pixel does belong to an edge
        int edge_type = 0;

        if (m > low_thresh)
        {
            const int tg22x = dxVal * TG22;
            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);

            dyVal <<= CANNY_SHIFT;

            if (dyVal < tg22x)
            {
                if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else if (dyVal > tg67x)
            {
                if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else
            {
                if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
        }

        map(y, x) = edge_type;
    }

    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));

        bindTexture(&tex_mag, mag);

        calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    __device__ int counter = 0;

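    // Block-local hysteresis: the 16x16 tile plus a one-pixel apron is staged in
    // shared memory, then weak edges (1) adjacent to strong edges (2) are promoted
    // over a fixed number of sweeps. Strong pixels that still border weak pixels
    // afterwards are pushed onto the global stack st for the global pass.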
    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
    {
        __shared__ volatile int smem[18][18];

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
        if (threadIdx.y == 0)
            smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
        if (threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
        if (threadIdx.x == 0)
            smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1)
            smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == 0)
            smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
            smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;

        __syncthreads();

        if (x >= map.cols || y >= map.rows)
            return;

        int n;

        #pragma unroll
        for (int k = 0; k < 16; ++k)
        {
            n = 0;

            if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
            {
                n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
            }

            if (n > 0)
                smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
        }

        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];

        map(y, x) = e;

        n = 0;

        if (e == 2)
        {
            n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
        }

        if (n > 0)
        {
            const int ind = ::atomicAdd(&counter, 1);
            st[ind] = make_ushort2(x, y);
        }
    }

    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );

        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );

        const dim3 block(16, 16);
        const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));

        edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    __constant__ int c_dx[8] = {-1,  0,  1, -1,  1, -1,  0,  1};
    __constant__ int c_dy[8] = {-1, -1, -1,  0,  0,  1,  1,  1};

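    // Global hysteresis: each block pops one seed from st1 and grows the
    // strong-edge region through a shared-memory work stack (8 neighbors per
    // item, one item per 8 threads). Work that overflows the local stack is
    // flushed to st2 so the host loop can relaunch until no work remains.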
    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
    {
        const int stack_size = 512;

        __shared__ int s_counter;
        __shared__ int s_ind;
        __shared__ ushort2 s_st[stack_size];

        if (threadIdx.x == 0)
            s_counter = 0;

        __syncthreads();

        int ind = blockIdx.y * gridDim.x + blockIdx.x;

        if (ind >= count)
            return;

        ushort2 pos = st1[ind];

        if (threadIdx.x < 8)
        {
            pos.x += c_dx[threadIdx.x];
            pos.y += c_dy[threadIdx.x];

            if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
            {
                map(pos.y, pos.x) = 2;

                ind = Emulation::smem::atomicAdd(&s_counter, 1);

                s_st[ind] = pos;
            }
        }

        __syncthreads();

        while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
        {
            const int subTaskIdx = threadIdx.x >> 3;
            const int portion = ::min(s_counter, blockDim.x >> 3);

            if (subTaskIdx < portion)
                pos = s_st[s_counter - 1 - subTaskIdx];

            __syncthreads();

            if (threadIdx.x == 0)
                s_counter -= portion;

            __syncthreads();

            if (subTaskIdx < portion)
            {
                pos.x += c_dx[threadIdx.x & 7];
                pos.y += c_dy[threadIdx.x & 7];

                if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
                {
                    map(pos.y, pos.x) = 2;

                    ind = Emulation::smem::atomicAdd(&s_counter, 1);

                    s_st[ind] = pos;
                }
            }

            __syncthreads();
        }

        if (s_counter > 0)
        {
            if (threadIdx.x == 0)
            {
                ind = ::atomicAdd(&counter, s_counter);
                s_ind = ind - s_counter;
            }

            __syncthreads();

            ind = s_ind;

            for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
                st2[ind + i] = s_st[i];
        }
    }

    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );

        int count;
        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

        while (count > 0)
        {
            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );

            const dim3 block(128);
            const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);

            edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

            std::swap(st1, st2);
        }
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    struct GetEdges : unary_function<int, uchar>
    {
        __device__ __forceinline__ uchar operator ()(int e) const
        {
            return (uchar)(-(e >> 1));
        }

        __host__ __device__ __forceinline__ GetEdges() {}
        __host__ __device__ __forceinline__ GetEdges(const GetEdges&) {}
    };
}

namespace cv { namespace gpu { namespace cudev
{
    template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
    {
        enum { smart_shift = 4 };
    };
}}}

namespace canny
{
    void getEdges(PtrStepSzi map, PtrStepSzb dst)
    {
        transform(map, dst, GetEdges(), WithOutMask(), 0);
    }
}

#endif /* CUDA_DISABLER */
186
modules/gpuimgproc/src/cuda/clahe.cu
Normal file
@@ -0,0 +1,186 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/scan.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"

using namespace cv::gpu;
using namespace cv::gpu::cudev;

namespace clahe
{
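    // One block per tile: builds a 256-bin histogram in shared memory, clips it
    // at clipLimit, redistributes the clipped mass evenly across bins, then turns
    // the result into a LUT (the tile CDF) via an inclusive block scan, scaled by
    // lutScale.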
    __global__ void calcLutKernel(const PtrStepb src, PtrStepb lut,
                                  const int2 tileSize, const int tilesX,
                                  const int clipLimit, const float lutScale)
    {
        __shared__ int smem[512];

        const int tx = blockIdx.x;
        const int ty = blockIdx.y;
        const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;

        smem[tid] = 0;
        __syncthreads();

        for (int i = threadIdx.y; i < tileSize.y; i += blockDim.y)
        {
            const uchar* srcPtr = src.ptr(ty * tileSize.y + i) + tx * tileSize.x;
            for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
            {
                const int data = srcPtr[j];
                Emulation::smem::atomicAdd(&smem[data], 1);
            }
        }

        __syncthreads();

        int tHistVal = smem[tid];

        __syncthreads();

        if (clipLimit > 0)
        {
            // clip histogram bar

            int clipped = 0;
            if (tHistVal > clipLimit)
            {
                clipped = tHistVal - clipLimit;
                tHistVal = clipLimit;
            }

            // find number of overall clipped samples

            reduce<256>(smem, clipped, tid, plus<int>());

            // broadcast evaluated value

            __shared__ int totalClipped;

            if (tid == 0)
                totalClipped = clipped;
            __syncthreads();

            // redistribute clipped samples evenly

            int redistBatch = totalClipped / 256;
            tHistVal += redistBatch;

            int residual = totalClipped - redistBatch * 256;
            if (tid < residual)
                ++tHistVal;
        }

        const int lutVal = blockScanInclusive<256>(tHistVal, smem, tid);

        lut(ty * tilesX + tx, tid) = saturate_cast<uchar>(__float2int_rn(lutScale * lutVal));
    }

    void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(tilesX, tilesY);

        calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);

        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

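    // Applies the per-tile LUTs with bilinear interpolation: each pixel blends
    // the LUT outputs of the four surrounding tiles so tile boundaries do not
    // produce visible seams.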
    __global__ void tranformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x >= src.cols || y >= src.rows)
            return;

        const float tyf = (static_cast<float>(y) / tileSize.y) - 0.5f;
        int ty1 = __float2int_rd(tyf);
        int ty2 = ty1 + 1;
        const float ya = tyf - ty1;
        ty1 = ::max(ty1, 0);
        ty2 = ::min(ty2, tilesY - 1);

        const float txf = (static_cast<float>(x) / tileSize.x) - 0.5f;
        int tx1 = __float2int_rd(txf);
        int tx2 = tx1 + 1;
        const float xa = txf - tx1;
        tx1 = ::max(tx1, 0);
        tx2 = ::min(tx2, tilesX - 1);

        const int srcVal = src(y, x);

        float res = 0;

        res += lut(ty1 * tilesX + tx1, srcVal) * ((1.0f - xa) * (1.0f - ya));
        res += lut(ty1 * tilesX + tx2, srcVal) * ((xa) * (1.0f - ya));
        res += lut(ty2 * tilesX + tx1, srcVal) * ((1.0f - xa) * (ya));
        res += lut(ty2 * tilesX + tx2, srcVal) * ((xa) * (ya));

        dst(y, x) = saturate_cast<uchar>(res);
    }

    void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

        cudaSafeCall( cudaFuncSetCacheConfig(tranformKernel, cudaFuncCachePreferL1) );

        tranformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

#endif // CUDA_DISABLER
461
modules/gpuimgproc/src/cuda/color.cu
Normal file
@@ -0,0 +1,461 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/color.hpp"
#include "cvt_color_internal.h"

namespace cv { namespace gpu { namespace cudev
{
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_x = 8 };
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
    {
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
    {
        enum { smart_shift = 4 };
    };

    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

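    // Each conversion below is a thin wrapper around the generic transform():
    // the traits class supplies a per-pixel functor, and the macros stamp out
    // uchar/ushort/float (and full-range) variants of every conversion.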
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
|
||||
void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) \
|
||||
{ \
|
||||
traits::functor_type functor = traits::create_functor(); \
|
||||
typedef typename traits::functor_type::argument_type src_t; \
|
||||
typedef typename traits::functor_type::result_type dst_t; \
|
||||
cv::gpu::cudev::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
|
||||
}
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(name) \
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
|
||||
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
|
||||
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv4)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv4)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgra)

    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgb)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgba)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgr)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgra)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgra)
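    // A note on the _FULL suffix (OpenCV color-conversion convention): for 8-bit
    // images the plain HSV/HLS conversions quantize hue as hue_degrees / 2, i.e.
    // into [0, 180) so it fits a uchar with headroom, while the *_FULL variants
    // spread hue over the whole byte range as hue_degrees * 255 / 360. For
    // example, a hue of 90 degrees becomes 45 in the default form and ~64 in _FULL.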
    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL
}}} // namespace cv { namespace gpu { namespace cudev

#endif /* CUDA_DISABLER */
274
modules/gpuimgproc/src/cuda/corners.cu
Normal file
@@ -0,0 +1,274 @@
/* (standard OpenCV license header, identical to the one above) */
#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"

namespace cv { namespace gpu { namespace cudev
{
    namespace imgproc
    {
        /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////

        texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
        texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);

        __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < dst.cols && y < dst.rows)
            {
                float a = 0.f;
                float b = 0.f;
                float c = 0.f;

                const int ibegin = y - (block_size / 2);
                const int jbegin = x - (block_size / 2);
                const int iend = ibegin + block_size;
                const int jend = jbegin + block_size;

                for (int i = ibegin; i < iend; ++i)
                {
                    for (int j = jbegin; j < jend; ++j)
                    {
                        float dx = tex2D(harrisDxTex, j, i);
                        float dy = tex2D(harrisDyTex, j, i);

                        a += dx * dx;
                        b += dx * dy;
                        c += dy * dy;
                    }
                }

                dst(y, x) = a * c - b * b - k * (a + c) * (a + c);
            }
        }
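        // The accumulators above form the 2x2 structure tensor M = [a b; b c]
        // with a = sum(dx*dx), b = sum(dx*dy), c = sum(dy*dy) over the
        // block_size window, and the written value is the Harris response
        //     R = det(M) - k * trace(M)^2 = (a*c - b*b) - k * (a + c)^2,
        // large and positive at corners, negative along edges, and near zero
        // in flat regions.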
        template <typename BR, typename BC>
        __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < dst.cols && y < dst.rows)
            {
                float a = 0.f;
                float b = 0.f;
                float c = 0.f;

                const int ibegin = y - (block_size / 2);
                const int jbegin = x - (block_size / 2);
                const int iend = ibegin + block_size;
                const int jend = jbegin + block_size;

                for (int i = ibegin; i < iend; ++i)
                {
                    const int y = border_col.idx_row(i);

                    for (int j = jbegin; j < jend; ++j)
                    {
                        const int x = border_row.idx_col(j);

                        float dx = tex2D(harrisDxTex, x, y);
                        float dy = tex2D(harrisDyTex, x, y);

                        a += dx * dx;
                        b += dx * dy;
                        c += dy * dy;
                    }
                }

                dst(y, x) = a * c - b * b - k * (a + c) * (a + c);
            }
        }

        void cornerHarris_gpu(int block_size, float k, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));

            bindTexture(&harrisDxTex, Dx);
            bindTexture(&harrisDyTex, Dy);

            switch (border_type)
            {
            case BORDER_REFLECT101:
                cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
                break;

            case BORDER_REFLECT:
                cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
                break;

            case BORDER_REPLICATE:
                cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst);
                break;
            }

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
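        // Only three border modes are dispatched above, and BORDER_REPLICATE
        // needs no border object at all: harrisDxTex/harrisDyTex were declared
        // with cudaAddressModeClamp, so out-of-range tex2D reads already clamp
        // to the edge pixel, which is exactly replicate behavior. In the
        // templated kernel the loop-local x/y deliberately shadow the thread
        // coordinates with border-remapped source coordinates; the final
        // dst(y, x) after the loops still uses the outer thread coordinates.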
        /////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////

        texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
        texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);

        __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < dst.cols && y < dst.rows)
            {
                float a = 0.f;
                float b = 0.f;
                float c = 0.f;

                const int ibegin = y - (block_size / 2);
                const int jbegin = x - (block_size / 2);
                const int iend = ibegin + block_size;
                const int jend = jbegin + block_size;

                for (int i = ibegin; i < iend; ++i)
                {
                    for (int j = jbegin; j < jend; ++j)
                    {
                        float dx = tex2D(minEigenValDxTex, j, i);
                        float dy = tex2D(minEigenValDyTex, j, i);

                        a += dx * dx;
                        b += dx * dy;
                        c += dy * dy;
                    }
                }

                a *= 0.5f;
                c *= 0.5f;

                dst(y, x) = (a + c) - sqrtf((a - c) * (a - c) + b * b);
            }
        }
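        // With p = sum(dx*dx), q = sum(dx*dy), r = sum(dy*dy), the eigenvalues
        // of the structure tensor [p q; q r] are ((p + r) +/- sqrt((p - r)^2 + 4*q^2)) / 2.
        // After the halving above (a = p/2, c = r/2, b = q) this collapses to
        // the expression written to dst:
        //     lambda_min = (a + c) - sqrtf((a - c)*(a - c) + b*b)
        // which is the Shi-Tomasi "good features to track" corner measure.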
        template <typename BR, typename BC>
        __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < dst.cols && y < dst.rows)
            {
                float a = 0.f;
                float b = 0.f;
                float c = 0.f;

                const int ibegin = y - (block_size / 2);
                const int jbegin = x - (block_size / 2);
                const int iend = ibegin + block_size;
                const int jend = jbegin + block_size;

                for (int i = ibegin; i < iend; ++i)
                {
                    int y = border_col.idx_row(i);

                    for (int j = jbegin; j < jend; ++j)
                    {
                        int x = border_row.idx_col(j);

                        float dx = tex2D(minEigenValDxTex, x, y);
                        float dy = tex2D(minEigenValDyTex, x, y);

                        a += dx * dx;
                        b += dx * dy;
                        c += dy * dy;
                    }
                }

                a *= 0.5f;
                c *= 0.5f;

                dst(y, x) = (a + c) - sqrtf((a - c) * (a - c) + b * b);
            }
        }

        void cornerMinEigenVal_gpu(int block_size, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));

            bindTexture(&minEigenValDxTex, Dx);
            bindTexture(&minEigenValDyTex, Dy);

            switch (border_type)
            {
            case BORDER_REFLECT101:
                cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
                break;

            case BORDER_REFLECT:
                cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
                break;

            case BORDER_REPLICATE:
                cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst);
                break;
            }

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    }
}}}

#endif
544
modules/gpuimgproc/src/cuda/debayer.cu
Normal file
@@ -0,0 +1,544 @@
/* (standard OpenCV license header, identical to the one above) */
#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/color.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"

namespace cv { namespace gpu { namespace cudev
{
    template <typename T> struct Bayer2BGR;

    template <> struct Bayer2BGR<uchar>
    {
        uchar3 res0;
        uchar3 res1;
        uchar3 res2;
        uchar3 res3;

        __device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
        {
            uchar4 patch[3][3];
            patch[0][1] = ((const uchar4*) src.ptr(s_y - 1))[s_x];
            patch[0][0] = ((const uchar4*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
            patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];

            patch[1][1] = ((const uchar4*) src.ptr(s_y))[s_x];
            patch[1][0] = ((const uchar4*) src.ptr(s_y))[::max(s_x - 1, 0)];
            patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];

            patch[2][1] = ((const uchar4*) src.ptr(s_y + 1))[s_x];
            patch[2][0] = ((const uchar4*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
            patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];

            if ((s_y & 1) ^ start_with_green)
            {
                const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
                const int t1 = (patch[1][0].w + patch[1][1].y + 1) >> 1;

                const int t2 = (patch[0][1].x + patch[0][1].z + patch[2][1].x + patch[2][1].z + 2) >> 2;
                const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][1].z + patch[2][1].y + 2) >> 2;

                const int t4 = (patch[0][1].z + patch[2][1].z + 1) >> 1;
                const int t5 = (patch[1][1].y + patch[1][1].w + 1) >> 1;

                const int t6 = (patch[0][1].z + patch[0][2].x + patch[2][1].z + patch[2][2].x + 2) >> 2;
                const int t7 = (patch[0][1].w + patch[1][1].z + patch[1][2].x + patch[2][1].w + 2) >> 2;

                if ((s_y & 1) ^ blue_last)
                {
                    res0.x = t1;
                    res0.y = patch[1][1].x;
                    res0.z = t0;

                    res1.x = patch[1][1].y;
                    res1.y = t3;
                    res1.z = t2;

                    res2.x = t5;
                    res2.y = patch[1][1].z;
                    res2.z = t4;

                    res3.x = patch[1][1].w;
                    res3.y = t7;
                    res3.z = t6;
                }
                else
                {
                    res0.x = t0;
                    res0.y = patch[1][1].x;
                    res0.z = t1;

                    res1.x = t2;
                    res1.y = t3;
                    res1.z = patch[1][1].y;

                    res2.x = t4;
                    res2.y = patch[1][1].z;
                    res2.z = t5;

                    res3.x = t6;
                    res3.y = t7;
                    res3.z = patch[1][1].w;
                }
            }
            else
            {
                const int t0 = (patch[0][0].w + patch[0][1].y + patch[2][0].w + patch[2][1].y + 2) >> 2;
                const int t1 = (patch[0][1].x + patch[1][0].w + patch[1][1].y + patch[2][1].x + 2) >> 2;

                const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
                const int t3 = (patch[1][1].x + patch[1][1].z + 1) >> 1;

                const int t4 = (patch[0][1].y + patch[0][1].w + patch[2][1].y + patch[2][1].w + 2) >> 2;
                const int t5 = (patch[0][1].z + patch[1][1].y + patch[1][1].w + patch[2][1].z + 2) >> 2;

                const int t6 = (patch[0][1].w + patch[2][1].w + 1) >> 1;
                const int t7 = (patch[1][1].z + patch[1][2].x + 1) >> 1;

                if ((s_y & 1) ^ blue_last)
                {
                    res0.x = patch[1][1].x;
                    res0.y = t1;
                    res0.z = t0;

                    res1.x = t3;
                    res1.y = patch[1][1].y;
                    res1.z = t2;

                    res2.x = patch[1][1].z;
                    res2.y = t5;
                    res2.z = t4;

                    res3.x = t7;
                    res3.y = patch[1][1].w;
                    res3.z = t6;
                }
                else
                {
                    res0.x = t0;
                    res0.y = t1;
                    res0.z = patch[1][1].x;

                    res1.x = t2;
                    res1.y = patch[1][1].y;
                    res1.z = t3;

                    res2.x = t4;
                    res2.y = t5;
                    res2.z = patch[1][1].z;

                    res3.x = t6;
                    res3.y = patch[1][1].w;
                    res3.z = t7;
                }
            }
        }
    };
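    // The specialization above is plain bilinear demosaicing, vectorized four
    // pixels per thread: each thread loads a 3x3 neighborhood of uchar4 words
    // (12 Bayer pixels wide) and reconstructs BGR for the 4 center pixels
    // (res0..res3). At a green site the two missing channels are 2-tap
    // averages (the (sum + 1) >> 1 terms); at a red/blue site the missing
    // green and the opposite channel are 4-tap averages (the (sum + 2) >> 2
    // terms). blue_last and start_with_green select among the four Bayer
    // phases (BG/GB/RG/GR).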
    template <typename D> __device__ __forceinline__ D toDst(const uchar3& pix);
    template <> __device__ __forceinline__ uchar toDst<uchar>(const uchar3& pix)
    {
        typename bgr_to_gray_traits<uchar>::functor_type f = bgr_to_gray_traits<uchar>::create_functor();
        return f(pix);
    }
    template <> __device__ __forceinline__ uchar3 toDst<uchar3>(const uchar3& pix)
    {
        return pix;
    }
    template <> __device__ __forceinline__ uchar4 toDst<uchar4>(const uchar3& pix)
    {
        return make_uchar4(pix.x, pix.y, pix.z, 255);
    }

    template <typename D>
    __global__ void Bayer2BGR_8u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
    {
        const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
        int s_y = blockIdx.y * blockDim.y + threadIdx.y;

        if (s_y >= src.rows || (s_x << 2) >= src.cols)
            return;

        s_y = ::min(::max(s_y, 1), src.rows - 2);

        Bayer2BGR<uchar> bayer;
        bayer.apply(src, s_x, s_y, blue_last, start_with_green);

        const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
        const int d_y = blockIdx.y * blockDim.y + threadIdx.y;

        dst(d_y, d_x) = toDst<D>(bayer.res0);
        if (d_x + 1 < src.cols)
            dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
        if (d_x + 2 < src.cols)
            dst(d_y, d_x + 2) = toDst<D>(bayer.res2);
        if (d_x + 3 < src.cols)
            dst(d_y, d_x + 3) = toDst<D>(bayer.res3);
    }

    template <> struct Bayer2BGR<ushort>
    {
        ushort3 res0;
        ushort3 res1;

        __device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
        {
            ushort2 patch[3][3];
            patch[0][1] = ((const ushort2*) src.ptr(s_y - 1))[s_x];
            patch[0][0] = ((const ushort2*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
            patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];

            patch[1][1] = ((const ushort2*) src.ptr(s_y))[s_x];
            patch[1][0] = ((const ushort2*) src.ptr(s_y))[::max(s_x - 1, 0)];
            patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];

            patch[2][1] = ((const ushort2*) src.ptr(s_y + 1))[s_x];
            patch[2][0] = ((const ushort2*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
            patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];

            if ((s_y & 1) ^ start_with_green)
            {
                const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
                const int t1 = (patch[1][0].y + patch[1][1].y + 1) >> 1;

                const int t2 = (patch[0][1].x + patch[0][2].x + patch[2][1].x + patch[2][2].x + 2) >> 2;
                const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][2].x + patch[2][1].y + 2) >> 2;

                if ((s_y & 1) ^ blue_last)
                {
                    res0.x = t1;
                    res0.y = patch[1][1].x;
                    res0.z = t0;

                    res1.x = patch[1][1].y;
                    res1.y = t3;
                    res1.z = t2;
                }
                else
                {
                    res0.x = t0;
                    res0.y = patch[1][1].x;
                    res0.z = t1;

                    res1.x = t2;
                    res1.y = t3;
                    res1.z = patch[1][1].y;
                }
            }
            else
            {
                const int t0 = (patch[0][0].y + patch[0][1].y + patch[2][0].y + patch[2][1].y + 2) >> 2;
                const int t1 = (patch[0][1].x + patch[1][0].y + patch[1][1].y + patch[2][1].x + 2) >> 2;

                const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
                const int t3 = (patch[1][1].x + patch[1][2].x + 1) >> 1;

                if ((s_y & 1) ^ blue_last)
                {
                    res0.x = patch[1][1].x;
                    res0.y = t1;
                    res0.z = t0;

                    res1.x = t3;
                    res1.y = patch[1][1].y;
                    res1.z = t2;
                }
                else
                {
                    res0.x = t0;
                    res0.y = t1;
                    res0.z = patch[1][1].x;

                    res1.x = t2;
                    res1.y = patch[1][1].y;
                    res1.z = t3;
                }
            }
        }
    };

    template <typename D> __device__ __forceinline__ D toDst(const ushort3& pix);
    template <> __device__ __forceinline__ ushort toDst<ushort>(const ushort3& pix)
    {
        typename bgr_to_gray_traits<ushort>::functor_type f = bgr_to_gray_traits<ushort>::create_functor();
        return f(pix);
    }
    template <> __device__ __forceinline__ ushort3 toDst<ushort3>(const ushort3& pix)
    {
        return pix;
    }
    template <> __device__ __forceinline__ ushort4 toDst<ushort4>(const ushort3& pix)
    {
        return make_ushort4(pix.x, pix.y, pix.z, numeric_limits<ushort>::max());
    }

    template <typename D>
    __global__ void Bayer2BGR_16u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
    {
        const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
        int s_y = blockIdx.y * blockDim.y + threadIdx.y;

        if (s_y >= src.rows || (s_x << 1) >= src.cols)
            return;

        s_y = ::min(::max(s_y, 1), src.rows - 2);

        Bayer2BGR<ushort> bayer;
        bayer.apply(src, s_x, s_y, blue_last, start_with_green);

        const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
        const int d_y = blockIdx.y * blockDim.y + threadIdx.y;

        dst(d_y, d_x) = toDst<D>(bayer.res0);
        if (d_x + 1 < src.cols)
            dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
    }

    template <int cn>
    void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
    {
        typedef typename TypeVec<uchar, cn>::vec_type dst_t;

        const dim3 block(32, 8);
        const dim3 grid(divUp(src.cols, 4 * block.x), divUp(src.rows, block.y));

        cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<dst_t>, cudaFuncCachePreferL1) );

        Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <int cn>
    void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
    {
        typedef typename TypeVec<ushort, cn>::vec_type dst_t;

        const dim3 block(32, 8);
        const dim3 grid(divUp(src.cols, 2 * block.x), divUp(src.rows, block.y));

        cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<dst_t>, cudaFuncCachePreferL1) );

        Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template void Bayer2BGR_8u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
    template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
    template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);

    template void Bayer2BGR_16u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
    template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
    template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);

    //////////////////////////////////////////////////////////////
    // Bayer Demosaicing (Malvar, He, and Cutler)
    //
    // by Morgan McGuire, Williams College
    // http://graphics.cs.williams.edu/papers/BayerJGT09/#shaders
    //
    // ported to CUDA

    texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);

    template <typename DstType>
    __global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
    {
        const float   kAx = -1.0f / 8.0f,     kAy = -1.5f / 8.0f,     kAz =  0.5f / 8.0f    /*kAw = -1.0f / 8.0f*/;
        const float   kBx =  2.0f / 8.0f,   /*kBy =  0.0f / 8.0f,*/ /*kBz =  0.0f / 8.0f,*/   kBw =  4.0f / 8.0f ;
        const float   kCx =  4.0f / 8.0f,     kCy =  6.0f / 8.0f,     kCz =  5.0f / 8.0f    /*kCw =  5.0f / 8.0f*/;
        const float /*kDx =  0.0f / 8.0f,*/   kDy =  2.0f / 8.0f,     kDz = -1.0f / 8.0f    /*kDw = -1.0f / 8.0f*/;
        const float   kEx = -1.0f / 8.0f,     kEy = -1.5f / 8.0f,   /*kEz = -1.0f / 8.0f,*/   kEw =  0.5f / 8.0f ;
        const float   kFx =  2.0f / 8.0f,   /*kFy =  0.0f / 8.0f,*/   kFz =  4.0f / 8.0f    /*kFw =  0.0f / 8.0f*/;
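        // Filter taps of the Malvar-He-Cutler 5x5 linear demosaicing filters,
        // written in units of 1/8 as in the original paper/shader. A..F label
        // the distinct tap positions of the 5x5 window; the commented-out
        // components are taps whose weight is zero (or duplicates another
        // component) and are kept only for reference.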
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x == 0 || x >= dst.cols - 1 || y == 0 || y >= dst.rows - 1)
            return;

        int2 center;
        center.x = x + sourceOffset.x;
        center.y = y + sourceOffset.y;

        int4 xCoord;
        xCoord.x = center.x - 2;
        xCoord.y = center.x - 1;
        xCoord.z = center.x + 1;
        xCoord.w = center.x + 2;

        int4 yCoord;
        yCoord.x = center.y - 2;
        yCoord.y = center.y - 1;
        yCoord.z = center.y + 1;
        yCoord.w = center.y + 2;

        float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)

        float4 Dvec;
        Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
        Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
        Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
        Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)

        float4 value;
        value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
        value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
        value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
        value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0

        // (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
        value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
        value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
        value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
        value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1

        float4 PATTERN;
        PATTERN.x = kCx * C;
        PATTERN.y = kCy * C;
        PATTERN.z = kCz * C;
        PATTERN.w = PATTERN.z;

        float D = Dvec.x + Dvec.y + Dvec.z + Dvec.w;

        // There are five filter patterns (identity, cross, checker,
        // theta, phi). Precompute the terms from all of them and then
        // use swizzles to assign to color channels.
        //
        // Channel   Matches
        //   x       cross   (e.g., EE G)
        //   y       checker (e.g., EE B)
        //   z       theta   (e.g., EO R)
        //   w       phi     (e.g., EO B)

        #define A value.x // A0 + A1
        #define B value.y // B0 + B1
        #define E value.z // E0 + E1
        #define F value.w // F0 + F1

        float3 temp;

        // PATTERN.yzw += (kD.yz * D).xyy;
        temp.x = kDy * D;
        temp.y = kDz * D;
        PATTERN.y += temp.x;
        PATTERN.z += temp.y;
        PATTERN.w += temp.y;

        // PATTERN += (kA.xyz * A).xyzx;
        temp.x = kAx * A;
        temp.y = kAy * A;
        temp.z = kAz * A;
        PATTERN.x += temp.x;
        PATTERN.y += temp.y;
        PATTERN.z += temp.z;
        PATTERN.w += temp.x;

        // PATTERN += (kE.xyw * E).xyxz;
        temp.x = kEx * E;
        temp.y = kEy * E;
        temp.z = kEw * E;
        PATTERN.x += temp.x;
        PATTERN.y += temp.y;
        PATTERN.z += temp.x;
        PATTERN.w += temp.z;

        // PATTERN.xw += kB.xw * B;
        PATTERN.x += kBx * B;
        PATTERN.w += kBw * B;

        // PATTERN.xz += kF.xz * F;
        PATTERN.x += kFx * F;
        PATTERN.z += kFz * F;

        // Determine which of four types of pixels we are on.
        int2 alternate;
        alternate.x = (x + firstRed.x) % 2;
        alternate.y = (y + firstRed.y) % 2;

        // in BGR sequence
        uchar3 pixelColor =
            (alternate.y == 0) ?
                ((alternate.x == 0) ?
                    make_uchar3(saturate_cast<uchar>(PATTERN.y), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(C)) :
                    make_uchar3(saturate_cast<uchar>(PATTERN.w), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.z))) :
                ((alternate.x == 0) ?
                    make_uchar3(saturate_cast<uchar>(PATTERN.z), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.w)) :
                    make_uchar3(saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(PATTERN.y)));

        dst(y, x) = toDst<DstType>(pixelColor);
    }

    template <int cn>
    void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream)
    {
        typedef typename TypeVec<uchar, cn>::vec_type dst_t;

        const dim3 block(32, 8);
        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

        bindTexture(&sourceTex, src);

        MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template void MHCdemosaic<1>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
    template void MHCdemosaic<3>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
    template void MHCdemosaic<4>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
}}}

#endif /* CUDA_DISABLER */
143
modules/gpuimgproc/src/cuda/gftt.cu
Normal file
@@ -0,0 +1,143 @@
/* (standard OpenCV license header, identical to the one above) */
#if !defined CUDA_DISABLER

#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"

namespace cv { namespace gpu { namespace cudev
{
    namespace gfft
    {
        texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp);

        __device__ int g_counter = 0;

        template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols)
        {
            const int j = blockIdx.x * blockDim.x + threadIdx.x;
            const int i = blockIdx.y * blockDim.y + threadIdx.y;

            if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j))
            {
                float val = tex2D(eigTex, j, i);

                if (val > threshold)
                {
                    float maxVal = val;

                    maxVal = ::fmax(tex2D(eigTex, j - 1, i - 1), maxVal);
                    maxVal = ::fmax(tex2D(eigTex, j    , i - 1), maxVal);
                    maxVal = ::fmax(tex2D(eigTex, j + 1, i - 1), maxVal);

                    maxVal = ::fmax(tex2D(eigTex, j - 1, i), maxVal);
                    maxVal = ::fmax(tex2D(eigTex, j + 1, i), maxVal);

                    maxVal = ::fmax(tex2D(eigTex, j - 1, i + 1), maxVal);
                    maxVal = ::fmax(tex2D(eigTex, j    , i + 1), maxVal);
                    maxVal = ::fmax(tex2D(eigTex, j + 1, i + 1), maxVal);

                    if (val == maxVal)
                    {
                        const int ind = ::atomicAdd(&g_counter, 1);

                        if (ind < max_count)
                            corners[ind] = make_float2(j, i);
                    }
                }
            }
        }
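        // findCorners keeps a pixel only if its eigenvalue response exceeds
        // the threshold AND equals the maximum of its 3x3 neighborhood
        // (non-maximum suppression via the eight ::fmax comparisons above).
        // Survivors are compacted into the output array with a single global
        // atomic counter: atomicAdd on g_counter hands every accepted corner
        // a unique slot, and slots beyond max_count are simply dropped.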
        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
        {
            void* counter_ptr;
            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );

            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );

            bindTexture(&eigTex, eig);

            dim3 block(16, 16);
            dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));

            if (mask.data)
                findCorners<<<grid, block>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
            else
                findCorners<<<grid, block>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);

            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            int count;
            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

            return std::min(count, max_count);
        }

        class EigGreater
        {
        public:
            __device__ __forceinline__ bool operator()(float2 a, float2 b) const
            {
                return tex2D(eigTex, a.x, a.y) > tex2D(eigTex, b.x, b.y);
            }
        };

        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count)
        {
            bindTexture(&eigTex, eig);

            thrust::device_ptr<float2> ptr(corners);

            thrust::sort(ptr, ptr + count, EigGreater());
        }
    } // namespace gfft
}}}

#endif /* CUDA_DISABLER */
153
modules/gpuimgproc/src/cuda/hist.cu
Normal file
@@ -0,0 +1,153 @@
/* (standard OpenCV license header, identical to the one above) */
#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"

using namespace cv::gpu;
using namespace cv::gpu::cudev;

namespace hist
{
    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
    {
        __shared__ int shist[256];

        const int y = blockIdx.x * blockDim.y + threadIdx.y;
        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        shist[tid] = 0;
        __syncthreads();

        if (y < rows)
        {
            const unsigned int* rowPtr = (const unsigned int*) (src + y * step);

            const int cols_4 = cols / 4;
            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
            {
                unsigned int data = rowPtr[x];

                Emulation::smem::atomicAdd(&shist[(data >>  0) & 0xFFU], 1);
                Emulation::smem::atomicAdd(&shist[(data >>  8) & 0xFFU], 1);
                Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
                Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
            }

            if (cols % 4 != 0 && threadIdx.x == 0)
            {
                for (int x = cols_4 * 4; x < cols; ++x)
                {
                    unsigned int data = ((const uchar*)rowPtr)[x];
                    Emulation::smem::atomicAdd(&shist[data], 1);
                }
            }
        }

        __syncthreads();

        const int histVal = shist[tid];
        if (histVal > 0)
            ::atomicAdd(hist + tid, histVal);
    }
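    // One 256-bin shared-memory histogram per block: blockDim is 32x8 = 256
    // threads, so thread tid owns bin tid for both the clear and the final
    // flush. Each row is consumed as 32-bit words (four 8-bit pixels per
    // load), with the leftover cols % 4 pixels handled by lane 0. Per-pixel
    // increments go through Emulation::smem::atomicAdd, OpenCV's wrapper for
    // shared-memory atomics (emulated on devices without native support), and
    // each block then adds only its non-zero bins to the global histogram.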
    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(src.rows, block.y));

        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

/////////////////////////////////////////////////////////////////////////

namespace hist
{
    __constant__ int c_lut[256];

    struct EqualizeHist : unary_function<uchar, uchar>
    {
        float scale;

        __host__ EqualizeHist(float _scale) : scale(_scale) {}

        __device__ __forceinline__ uchar operator ()(uchar val) const
        {
            const int lut = c_lut[val];
            return __float2int_rn(scale * lut);
        }
    };
}

namespace cv { namespace gpu { namespace cudev
{
    template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist>
    {
        enum { smart_shift = 4 };
    };
}}}

namespace hist
{
    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
    {
        if (stream == 0)
            cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
        else
            cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );

        const float scale = 255.0f / (src.cols * src.rows);

        cudev::transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
    }
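    // equalizeHist expects lut to hold the cumulative histogram (CDF) of src,
    // already resident on the device (hence the device-to-device copy into
    // constant memory). Since the top bin of the CDF reaches cols * rows,
    // scale = 255 / (cols * rows) rescales it into [0, 255]; e.g. for a
    // 640x480 image a CDF value of 153600 (half of the 307200 pixels) maps to
    // __float2int_rn(0.5f * 255) == 128.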
}

#endif /* CUDA_DISABLER */
1709
modules/gpuimgproc/src/cuda/hough.cu
Normal file
File diff suppressed because it is too large
916
modules/gpuimgproc/src/cuda/match_template.cu
Normal file
@@ -0,0 +1,916 @@
/* (standard OpenCV license header, identical to the one above) */
#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_math.hpp"

namespace cv { namespace gpu { namespace cudev
{
    namespace match_template
    {
        __device__ __forceinline__ float sum(float v) { return v; }
        __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
        __device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
        __device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }

        __device__ __forceinline__ float first(float v) { return v; }
        __device__ __forceinline__ float first(float2 v) { return v.x; }
        __device__ __forceinline__ float first(float3 v) { return v.x; }
        __device__ __forceinline__ float first(float4 v) { return v.x; }

        __device__ __forceinline__ float mul(float a, float b) { return a * b; }
        __device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
        __device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
        __device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }

        __device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }
        __device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
        __device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
        __device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }

        __device__ __forceinline__ float sub(float a, float b) { return a - b; }
        __device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
        __device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
        __device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }

        __device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }
        __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
        __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
        __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }

        //////////////////////////////////////////////////////////////////////
        // Naive_CCORR

        template <typename T, int cn>
        __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
        {
            typedef typename TypeVec<T, cn>::vec_type Type;
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef res = VecTraits<Typef>::all(0);

                for (int i = 0; i < h; ++i)
                {
                    const Type* image_ptr = (const Type*)image.ptr(y + i);
                    const Type* templ_ptr = (const Type*)templ.ptr(i);
                    for (int j = 0; j < w; ++j)
                        res = res + mul(image_ptr[x + j], templ_ptr[j]);
                }

                result.ptr(y)[x] = sum(res);
            }
        }
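        // One thread per result pixel, each computing the full w x h
        // correlation sum, so the naive kernel costs O(w * h) per output
        // value; the host wrapper typically uses it only for small templates
        // and switches to an FFT-based CCORR for large ones. For reference, a
        // hypothetical single-channel CPU check of the same quantity (float
        // inputs; names are illustrative only, not part of this module):
        //
        //   float ccorrAt(const float* I, int istep, const float* T, int tstep,
        //                 int w, int h, int x, int y)
        //   {
        //       float s = 0.f;
        //       for (int i = 0; i < h; ++i)       // rows of the template
        //           for (int j = 0; j < w; ++j)   // columns of the template
        //               s += I[(y + i) * istep + (x + j)] * T[i * tstep + j];
        //       return s;                         // == result.ptr(y)[x] above
        //   }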
        template <typename T, int cn>
        void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Naive_SQDIFF

        template <typename T, int cn>
        __global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
        {
            typedef typename TypeVec<T, cn>::vec_type Type;
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef res = VecTraits<Typef>::all(0);
                Typef delta;

                for (int i = 0; i < h; ++i)
                {
                    const Type* image_ptr = (const Type*)image.ptr(y + i);
                    const Type* templ_ptr = (const Type*)templ.ptr(i);
                    for (int j = 0; j < w; ++j)
                    {
                        delta = sub(image_ptr[x + j], templ_ptr[j]);
                        res = res + delta * delta;
                    }
                }

                result.ptr(y)[x] = sum(res);
            }
        }

        template <typename T, int cn>
        void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_SQDIFF

        template <int cn>
        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;
            }
        }
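        // This "prepared" kernel uses the expansion
        //     sum((I - T)^2) = sum(I^2) - 2 * sum(I*T) + sum(T^2):
        // result already holds the cross-correlation sum(I*T) from a CCORR
        // pass, image_sqsum is the integral image of squared pixels (so the
        // four-corner difference above yields sum(I^2) over the w x h window
        // in O(1)), and templ_sqsum is a scalar computed once on the host.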
|
||||
template <int cn>
|
||||
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream)
|
||||
{
|
||||
const dim3 threads(32, 8);
|
||||
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
|
||||
|
||||
matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[] =
|
||||
{
|
||||
0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
|
||||
};
|
||||
|
||||
callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
|
||||
}
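
        // Hypothetical host-side usage sketch (the variable names below are
        // assumptions for illustration, not part of this file):
        //   // image_sqsum : integral image of squared pixel values
        //   // templ_sqsum : scalar sum of squared template values
        //   // result      : raw cross-correlation of image and template
        //   matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows,
        //                                   image_sqsum, templ_sqsum,
        //                                   result, cn, stream);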

        //////////////////////////////////////////////////////////////////////
        // Prepared_SQDIFF_NORMED

        // normAcc* are accurate normalization routines which make the GPU
        // matchTemplate results consistent with the CPU implementation.

        __device__ float normAcc(float num, float denum)
        {
            if (::fabs(num) < denum)
                return num / denum;
            if (::fabs(num) < denum * 1.125f)
                return num > 0 ? 1 : -1;
            return 0;
        }

        __device__ float normAcc_SQDIFF(float num, float denum)
        {
            if (::fabs(num) < denum)
                return num / denum;
            if (::fabs(num) < denum * 1.125f)
                return num > 0 ? 1 : -1;
            return 1;
        }
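
        // Rationale: floating-point accumulation can make |num| slightly
        // exceed the denominator even when the exact ratio lies in [-1, 1].
        // Overshoots of up to 12.5% are clamped to +/-1; anything beyond
        // that is treated as a degenerate (near-zero variance) window, for
        // which 0 is the neutral correlation score and 1 the neutral
        // SQDIFF_NORMED score.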

        template <int cn>
        __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
                int w, int h, const PtrStep<unsigned long long> image_sqsum,
                unsigned long long templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,
                                                  sqrtf(image_sqsum_ * templ_sqsum));
            }
        }

        template <int cn>
        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
                                                    PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
                                                    PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
            };

            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_CCOFF
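
        // CCOEFF subtracts the window means: since
        //   sum((T - mean(T)) .* I) = sum(T .* I) - mean(T) * sum(I),
        // the kernel below only needs the raw cross-correlation already in
        // `result`, the integral image of I for the window sum, and
        // templ_sum_scale = sum(T) / (w*h) = mean(T).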
        __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_ = (float)(
                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;
            }
        }

        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
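
        // The multi-channel variants below apply the same correction per
        // channel: ccorr - sum over channels of window_sum(I_c) * mean(T_c),
        // with one integral image per channel.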

        __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
                int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
                const PtrStep<unsigned int> image_sum_r,
                const PtrStep<unsigned int> image_sum_g,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g;
            }
        }

        void matchTemplatePrepared_CCOFF_8UC2(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r,
                const PtrStepSz<unsigned int> image_sum_g,
                unsigned int templ_sum_r, unsigned int templ_sum_g,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
                    w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
                    image_sum_r, image_sum_g, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
                int w, int h,
                float templ_sum_scale_r,
                float templ_sum_scale_g,
                float templ_sum_scale_b,
                const PtrStep<unsigned int> image_sum_r,
                const PtrStep<unsigned int> image_sum_g,
                const PtrStep<unsigned int> image_sum_b,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g
                                         - image_sum_b_ * templ_sum_scale_b;
            }
        }

        void matchTemplatePrepared_CCOFF_8UC3(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r,
                const PtrStepSz<unsigned int> image_sum_g,
                const PtrStepSz<unsigned int> image_sum_b,
                unsigned int templ_sum_r,
                unsigned int templ_sum_g,
                unsigned int templ_sum_b,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
                    w, h,
                    (float)templ_sum_r / (w * h),
                    (float)templ_sum_g / (w * h),
                    (float)templ_sum_b / (w * h),
                    image_sum_r, image_sum_g, image_sum_b, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
                int w, int h,
                float templ_sum_scale_r,
                float templ_sum_scale_g,
                float templ_sum_scale_b,
                float templ_sum_scale_a,
                const PtrStep<unsigned int> image_sum_r,
                const PtrStep<unsigned int> image_sum_g,
                const PtrStep<unsigned int> image_sum_b,
                const PtrStep<unsigned int> image_sum_a,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float image_sum_a_ = (float)(
                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g
                                         - image_sum_b_ * templ_sum_scale_b
                                         - image_sum_a_ * templ_sum_scale_a;
            }
        }

        void matchTemplatePrepared_CCOFF_8UC4(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r,
                const PtrStepSz<unsigned int> image_sum_g,
                const PtrStepSz<unsigned int> image_sum_b,
                const PtrStepSz<unsigned int> image_sum_a,
                unsigned int templ_sum_r,
                unsigned int templ_sum_g,
                unsigned int templ_sum_b,
                unsigned int templ_sum_a,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
                    w, h,
                    (float)templ_sum_r / (w * h),
                    (float)templ_sum_g / (w * h),
                    (float)templ_sum_b / (w * h),
                    (float)templ_sum_a / (w * h),
                    image_sum_r, image_sum_g, image_sum_b, image_sum_a,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_CCOFF_NORMED
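
        // CCOEFF_NORMED divides the mean-corrected correlation by the
        // product of the window and template deviation terms. With
        // weight = 1/(w*h):
        //   num   = ccorr - window_sum(I) * mean(T)
        //   denum = sqrt( (sum(T^2) - weight*sum(T)^2)
        //               * (window_sum(I^2) - weight*window_sum(I)^2) )
        // templ_sqsum_scale precomputes the template factor on the host.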
        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
                int w, int h, float weight,
                float templ_sum_scale, float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum,
                const PtrStep<unsigned long long> image_sqsum,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float ccorr = result.ptr(y)[x];
                float image_sum_ = (float)(
                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
                        (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
                result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,
                                           sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8U(
                int w, int h, const PtrStepSz<unsigned int> image_sum,
                const PtrStepSz<unsigned long long> image_sqsum,
                unsigned int templ_sum, unsigned long long templ_sqsum,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale = templ_sum * weight;
            float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;

            matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
                    w, h, weight, templ_sum_scale, templ_sqsum_scale,
                    image_sum, image_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
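
        // For multiple channels the per-channel variance terms are summed
        // inside a single square root below, effectively treating all
        // channels as one long sample vector.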

        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g,
                float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sqsum_r_ = (float)(
                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sqsum_g_ = (float)(
                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
                                             - image_sum_g_ * templ_sum_scale_g;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                       + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                    + templ_sqsum_g - weight * templ_sum_g * templ_sum_g;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
                float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sqsum_r_ = (float)(
                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sqsum_g_ = (float)(
                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float image_sqsum_b_ = (float)(
                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
                                             - image_sum_g_ * templ_sum_scale_g
                                             - image_sum_b_ * templ_sum_scale_b;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                       + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
                                                       + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sum_scale_b = templ_sum_b * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                    + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
                                    + templ_sqsum_b - weight * templ_sum_b * templ_sum_b;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    image_sum_b, image_sqsum_b,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
                float templ_sum_scale_a, float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
                const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sqsum_r_ = (float)(
                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sqsum_g_ = (float)(
                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float image_sqsum_b_ = (float)(
                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
                float image_sum_a_ = (float)(
                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
                float image_sqsum_a_ = (float)(
                        (image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -
                        (image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g
                                             - image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                       + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
                                                       + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
                                                       + image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
                const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
                unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sum_scale_b = templ_sum_b * weight;
            float templ_sum_scale_a = templ_sum_a * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                    + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
                                    + templ_sqsum_b - weight * templ_sum_b * templ_sum_b
                                    + templ_sqsum_a - weight * templ_sum_a * templ_sum_a;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    image_sum_b, image_sqsum_b,
                    image_sum_a, image_sqsum_a,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // normalize
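
        // Used by the normalized-correlation path: `result` arrives holding
        // raw cross-correlation values and is divided in place by
        // sqrt(window_sum(I^2) * sum(T^2)).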
        template <int cn>
        __global__ void normalizeKernel_8U(
                int w, int h, const PtrStep<unsigned long long> image_sqsum,
                unsigned long long templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
                result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));
            }
        }

        void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
                          unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            switch (cn)
            {
            case 1:
                normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 2:
                normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 3:
                normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 4:
                normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            }

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // extractFirstChannel

        template <int cn>
        __global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
        {
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef val = ((const Typef*)image.ptr(y))[x];
                result.ptr(y)[x] = first(val);
            }
        }

        void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            switch (cn)
            {
            case 1:
                extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 2:
                extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 3:
                extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 4:
                extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
                break;
            }

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    } // namespace match_template
}}} // namespace cv { namespace gpu { namespace cudev


#endif /* CUDA_DISABLER */
182
modules/gpuimgproc/src/cuda/mean_shift.cu
Normal file
@@ -0,0 +1,182 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"

namespace cv { namespace gpu { namespace cudev
{
    namespace imgproc
    {
        texture<uchar4, 2> tex_meanshift;
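
        // One mean-shift iteration per loop pass: average the positions and
        // colors of all pixels inside the (2*sp+1)^2 spatial window whose
        // color distance from the current center is at most sr, then move
        // the center to that average. Iteration stops after maxIter passes,
        // when the window is empty, or once the combined spatial-plus-color
        // shift falls to eps.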
        __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
                                        size_t out_step, int cols, int rows,
                                        int sp, int sr, int maxIter, float eps)
        {
            int isr2 = sr * sr;
            uchar4 c = tex2D(tex_meanshift, x0, y0);

            // iterate the mean shift procedure
            for (int iter = 0; iter < maxIter; iter++)
            {
                int count = 0;
                int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;
                float icount;

                // mean shift: process pixels in the window (p-sigmaSp)x(p+sigmaSp)
                int minx = x0 - sp;
                int miny = y0 - sp;
                int maxx = x0 + sp;
                int maxy = y0 + sp;

                for (int y = miny; y <= maxy; y++)
                {
                    int rowCount = 0;
                    for (int x = minx; x <= maxx; x++)
                    {
                        uchar4 t = tex2D(tex_meanshift, x, y);

                        int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);
                        if (norm2 <= isr2)
                        {
                            s0 += t.x; s1 += t.y; s2 += t.z;
                            sx += x; rowCount++;
                        }
                    }
                    count += rowCount;
                    sy += y * rowCount;
                }

                if (count == 0)
                    break;

                icount = 1.f / count;
                int x1 = __float2int_rz(sx * icount);
                int y1 = __float2int_rz(sy * icount);
                s0 = __float2int_rz(s0 * icount);
                s1 = __float2int_rz(s1 * icount);
                s2 = __float2int_rz(s2 * icount);

                int norm2 = (s0 - c.x) * (s0 - c.x) + (s1 - c.y) * (s1 - c.y) + (s2 - c.z) * (s2 - c.z);

                bool stopFlag = (x0 == x1 && y0 == y1) || (::abs(x1 - x0) + ::abs(y1 - y0) + norm2 <= eps);

                x0 = x1; y0 = y1;
                c.x = s0; c.y = s1; c.z = s2;

                if (stopFlag)
                    break;
            }

            int base = (blockIdx.y * blockDim.y + threadIdx.y) * out_step + (blockIdx.x * blockDim.x + threadIdx.x) * 4 * sizeof(uchar);
            *(uchar4*)(out + base) = c;

            return make_short2((short)x0, (short)y0);
        }

        __global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps)
        {
            int x0 = blockIdx.x * blockDim.x + threadIdx.x;
            int y0 = blockIdx.y * blockDim.y + threadIdx.y;

            if (x0 < cols && y0 < rows)
                do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
        }

        void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
        {
            dim3 grid(1, 1, 1);
            dim3 threads(32, 8, 1);
            grid.x = divUp(src.cols, threads.x);
            grid.y = divUp(src.rows, threads.y);

            cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
            cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );

            meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
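
        // Hypothetical usage sketch (names assumed for illustration; the
        // texture binding above requires 8UC4 input of the same size as dst):
        //   // src, dst: uchar4 device images of equal dimensions
        //   meanShiftFiltering_gpu(src, dst, 10 /*sp*/, 20 /*sr*/,
        //                          5 /*maxIter*/, 1.f /*eps*/, 0 /*stream*/);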

        __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
                                             unsigned char* outsp, size_t outspstep,
                                             int cols, int rows,
                                             int sp, int sr, int maxIter, float eps)
        {
            int x0 = blockIdx.x * blockDim.x + threadIdx.x;
            int y0 = blockIdx.y * blockDim.y + threadIdx.y;

            if (x0 < cols && y0 < rows)
            {
                int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
                *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
            }
        }

        void meanShiftProc_gpu(const PtrStepSzb& src, PtrStepSzb dstr, PtrStepSzb dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
        {
            dim3 grid(1, 1, 1);
            dim3 threads(32, 8, 1);
            grid.x = divUp(src.cols, threads.x);
            grid.y = divUp(src.rows, threads.y);

            cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
            cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );

            meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
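
        // meanShiftProc_gpu additionally records, per pixel, the final
        // (x, y) position the mean-shift iteration converged to (as 16-bit
        // pairs in dstsp), which downstream segmentation steps can use.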
    } // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev

#endif /* CUDA_DISABLER */