Revert "Revert "Merge pull request #836 from jet47:gpu-modules""
This commit is contained in:
		
							
								
								
									
										199
									
								
								modules/gpuimgproc/src/cuda/bilateral_filter.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										199
									
								
								modules/gpuimgproc/src/cuda/bilateral_filter.cu
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,199 @@ | ||||
| /*M/////////////////////////////////////////////////////////////////////////////////////// | ||||
| // | ||||
| //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. | ||||
| // | ||||
| //  By downloading, copying, installing or using the software you agree to this license. | ||||
| //  If you do not agree to this license, do not download, install, | ||||
| //  copy or use the software. | ||||
| // | ||||
| // | ||||
| //                           License Agreement | ||||
| //                For Open Source Computer Vision Library | ||||
| // | ||||
| // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. | ||||
| // Copyright (C) 2009, Willow Garage Inc., all rights reserved. | ||||
| // Third party copyrights are property of their respective owners. | ||||
| // | ||||
| // Redistribution and use in source and binary forms, with or without modification, | ||||
| // are permitted provided that the following conditions are met: | ||||
| // | ||||
| //   * Redistribution's of source code must retain the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer. | ||||
| // | ||||
| //   * Redistribution's in binary form must reproduce the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer in the documentation | ||||
| //     and/or other materials provided with the distribution. | ||||
| // | ||||
| //   * The name of the copyright holders may not be used to endorse or promote products | ||||
| //     derived from this software without specific prior written permission. | ||||
| // | ||||
| // This software is provided by the copyright holders and contributors "as is" and | ||||
| // any express or implied warranties, including, but not limited to, the implied | ||||
| // warranties of merchantability and fitness for a particular purpose are disclaimed. | ||||
| // In no event shall the Intel Corporation or contributors be liable for any direct, | ||||
| // indirect, incidental, special, exemplary, or consequential damages | ||||
| // (including, but not limited to, procurement of substitute goods or services; | ||||
| // loss of use, data, or profits; or business interruption) however caused | ||||
| // and on any theory of liability, whether in contract, strict liability, | ||||
| // or tort (including negligence or otherwise) arising in any way out of | ||||
| // the use of this software, even if advised of the possibility of such damage. | ||||
| // | ||||
| //M*/ | ||||
|  | ||||
| #if !defined CUDA_DISABLER | ||||
|  | ||||
| #include "opencv2/core/cuda/common.hpp" | ||||
| #include "opencv2/core/cuda/vec_traits.hpp" | ||||
| #include "opencv2/core/cuda/vec_math.hpp" | ||||
| #include "opencv2/core/cuda/border_interpolate.hpp" | ||||
|  | ||||
| using namespace cv::gpu; | ||||
|  | ||||
| typedef unsigned char uchar; | ||||
| typedef unsigned short ushort; | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////////////// | ||||
| /// Bilateral filtering | ||||
|  | ||||
| namespace cv { namespace gpu { namespace cudev | ||||
| { | ||||
|     namespace imgproc | ||||
|     { | ||||
|         // L1 norm of a float pixel with 1 to 4 channels: the sum of the absolute channel values. | ||||
|         __device__ __forceinline__ float norm_l1(const float& a) | ||||
|         { | ||||
|             return ::fabs(a); | ||||
|         } | ||||
|         __device__ __forceinline__ float norm_l1(const float2& a) | ||||
|         { | ||||
|             const float ax = ::fabs(a.x); | ||||
|             const float ay = ::fabs(a.y); | ||||
|             return ax + ay; | ||||
|         } | ||||
|         __device__ __forceinline__ float norm_l1(const float3& a) | ||||
|         { | ||||
|             const float ax = ::fabs(a.x); | ||||
|             const float ay = ::fabs(a.y); | ||||
|             const float az = ::fabs(a.z); | ||||
|             return ax + ay + az; | ||||
|         } | ||||
|         __device__ __forceinline__ float norm_l1(const float4& a) | ||||
|         { | ||||
|             const float ax = ::fabs(a.x); | ||||
|             const float ay = ::fabs(a.y); | ||||
|             const float az = ::fabs(a.z); | ||||
|             const float aw = ::fabs(a.w); | ||||
|             return ax + ay + az + aw; | ||||
|         } | ||||
|  | ||||
|         // Returns the square of a. | ||||
|         __device__ __forceinline__ float sqr(const float& a) | ||||
|         { | ||||
|             const float v = a; | ||||
|             return v * v; | ||||
|         } | ||||
|  | ||||
|         // Brute-force bilateral filter; each thread produces one output pixel (x, y). | ||||
|         // | ||||
|         // src / dst                - input image and output image | ||||
|         // b                        - border functor, used via b.at(...) when the window leaves the image | ||||
|         // ksz                      - window size; samples farther than ksz / 2 from the centre are skipped | ||||
|         // sigma_spatial2_inv_half  - factor multiplied with the squared spatial distance inside exp() | ||||
|         // sigma_color2_inv_half    - factor multiplied with the squared L1 colour distance inside exp() | ||||
|         template<typename T, typename B> | ||||
|         __global__ void bilateral_kernel(const PtrStepSz<T> src, PtrStep<T> dst, const B b, const int ksz, const float sigma_spatial2_inv_half, const float sigma_color2_inv_half) | ||||
|         { | ||||
|             typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type; | ||||
|  | ||||
|             const int x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             // Threads that fall outside the image have nothing to do. | ||||
|             if (x >= src.cols || y >= src.rows) | ||||
|                 return; | ||||
|  | ||||
|             const value_type center = saturate_cast<value_type>(src(y, x)); | ||||
|  | ||||
|             value_type acc = VecTraits<value_type>::all(0); | ||||
|             float wsum = 0; | ||||
|  | ||||
|             const int r = ksz / 2; | ||||
|             const float r2 = (float)(r * r); | ||||
|  | ||||
|             // Window covers [x0, x1) x [y0, y1). | ||||
|             const int x0 = x - r; | ||||
|             const int y0 = y - r; | ||||
|             const int x1 = x0 + ksz; | ||||
|             const int y1 = y0 + ksz; | ||||
|  | ||||
|             const bool interior = x0 >= 0 && y0 >= 0 && x1 < src.cols && y1 < src.rows; | ||||
|  | ||||
|             if (!interior) | ||||
|             { | ||||
|                 // Window may leave the image: fetch every sample through the border functor. | ||||
|                 for (int cy = y0; cy < y1; ++cy) | ||||
|                     for (int cx = x0; cx < x1; ++cx) | ||||
|                     { | ||||
|                         float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy); | ||||
|                         if (space2 > r2) | ||||
|                             continue; | ||||
|  | ||||
|                         value_type value = saturate_cast<value_type>(b.at(cy, cx, src.data, src.step)); | ||||
|  | ||||
|                         float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half); | ||||
|  | ||||
|                         acc = acc + weight * value; | ||||
|                         wsum = wsum + weight; | ||||
|                     } | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 // Window lies inside the image: read the source directly. | ||||
|                 for (int cy = y0; cy < y1; ++cy) | ||||
|                     for (int cx = x0; cx < x1; ++cx) | ||||
|                     { | ||||
|                         float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy); | ||||
|                         if (space2 > r2) | ||||
|                             continue; | ||||
|  | ||||
|                         value_type value = saturate_cast<value_type>(src(cy, cx)); | ||||
|  | ||||
|                         float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half); | ||||
|                         acc = acc + weight * value; | ||||
|                         wsum = wsum + weight; | ||||
|                     } | ||||
|             } | ||||
|  | ||||
|             // Normalise by the total weight and convert back to the pixel type. | ||||
|             dst(y, x) = saturate_cast<T>(acc / wsum); | ||||
|         } | ||||
|  | ||||
|         // Host-side launcher for bilateral_kernel with pixel type T and border functor B<T>. | ||||
|         // | ||||
|         // src, dst       - input and output images, reinterpreted as PtrStepSz<T> | ||||
|         // kernel_size    - filter window size passed to the kernel | ||||
|         // sigma_spatial  - spatial sigma; converted to -0.5 / sigma^2 before the launch | ||||
|         // sigma_color    - colour sigma; converted to -0.5 / sigma^2 before the launch | ||||
|         // stream         - CUDA stream the kernel is enqueued on; when it is the null stream | ||||
|         //                  the call blocks until the device has finished | ||||
|         template<typename T, template <typename> class B> | ||||
|         void bilateral_caller(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream) | ||||
|         { | ||||
|             dim3 block (32, 8); | ||||
|             dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y)); | ||||
|  | ||||
|             B<T> b(src.rows, src.cols); | ||||
|  | ||||
|             float sigma_spatial2_inv_half = -0.5f/(sigma_spatial * sigma_spatial); | ||||
|             float sigma_color2_inv_half = -0.5f/(sigma_color * sigma_color); | ||||
|  | ||||
|             cudaSafeCall( cudaFuncSetCacheConfig (bilateral_kernel<T, B<T> >, cudaFuncCachePreferL1) ); | ||||
|             // Enqueue on the caller's stream. Launching without the stream argument would run the | ||||
|             // kernel on the default stream and return unsynchronised for any non-null stream. | ||||
|             bilateral_kernel<<<grid, block, 0, stream>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, kernel_size, sigma_spatial2_inv_half, sigma_color2_inv_half); | ||||
|             cudaSafeCall ( cudaGetLastError () ); | ||||
|  | ||||
|             if (stream == 0) | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|         } | ||||
|  | ||||
|         // Entry point for the bilateral filter: picks the launcher that matches borderMode | ||||
|         // and forwards all remaining arguments to it unchanged. | ||||
|         // borderMode indexes the table below directly and is not range-checked here. | ||||
|         template<typename T> | ||||
|         void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float gauss_spatial_coeff, float gauss_color_coeff, int borderMode, cudaStream_t stream) | ||||
|         { | ||||
|             typedef void (*caller_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream); | ||||
|  | ||||
|             // One launcher per border functor, in table order. | ||||
|             static caller_t callers[] = | ||||
|             { | ||||
|                 bilateral_caller<T, BrdConstant>, | ||||
|                 bilateral_caller<T, BrdReplicate>, | ||||
|                 bilateral_caller<T, BrdReflect>, | ||||
|                 bilateral_caller<T, BrdWrap>, | ||||
|                 bilateral_caller<T, BrdReflect101> | ||||
|             }; | ||||
|  | ||||
|             const caller_t launch = callers[borderMode]; | ||||
|             launch(src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream); | ||||
|         } | ||||
|     } | ||||
| }}} | ||||
|  | ||||
|  | ||||
| // Expands to an explicit instantiation of bilateral_filter_gpu<T> for pixel type T. | ||||
| #define OCV_INSTANTIATE_BILATERAL_FILTER(T) \ | ||||
|     template void cv::gpu::cudev::imgproc::bilateral_filter_gpu<T>(const PtrStepSzb&, PtrStepSzb, int, float, float, int, cudaStream_t); | ||||
|  | ||||
| // Instantiated: 1-, 3- and 4-channel uchar, short, ushort and float. | ||||
| // All other element types and the 2-channel variants are left commented out. | ||||
| OCV_INSTANTIATE_BILATERAL_FILTER(uchar) | ||||
| //OCV_INSTANTIATE_BILATERAL_FILTER(uchar2) | ||||
| OCV_INSTANTIATE_BILATERAL_FILTER(uchar3) | ||||
| OCV_INSTANTIATE_BILATERAL_FILTER(uchar4) | ||||
|  | ||||
| //OCV_INSTANTIATE_BILATERAL_FILTER(schar) | ||||
| //OCV_INSTANTIATE_BILATERAL_FILTER(schar2) | ||||
| //OCV_INSTANTIATE_BILATERAL_FILTER(schar3) | ||||
| //OCV_INSTANTIATE_BILATERAL_FILTER(schar4) | ||||
|  | ||||
| OCV_INSTANTIATE_BILATERAL_FILTER(short) | ||||
| //OCV_INSTANTIATE_BILATERAL_FILTER(short2) | ||||
| OCV_INSTANTIATE_BILATERAL_FILTER(short3) | ||||
| OCV_INSTANTIATE_BILATERAL_FILTER(short4) | ||||
|  | ||||
| OCV_INSTANTIATE_BILATERAL_FILTER(ushort) | ||||
| //OCV_INSTANTIATE_BILATERAL_FILTER(ushort2) | ||||
| OCV_INSTANTIATE_BILATERAL_FILTER(ushort3) | ||||
| OCV_INSTANTIATE_BILATERAL_FILTER(ushort4) | ||||
|  | ||||
| //OCV_INSTANTIATE_BILATERAL_FILTER(int) | ||||
| //OCV_INSTANTIATE_BILATERAL_FILTER(int2) | ||||
| //OCV_INSTANTIATE_BILATERAL_FILTER(int3) | ||||
| //OCV_INSTANTIATE_BILATERAL_FILTER(int4) | ||||
|  | ||||
| OCV_INSTANTIATE_BILATERAL_FILTER(float) | ||||
| //OCV_INSTANTIATE_BILATERAL_FILTER(float2) | ||||
| OCV_INSTANTIATE_BILATERAL_FILTER(float3) | ||||
| OCV_INSTANTIATE_BILATERAL_FILTER(float4) | ||||
|  | ||||
|  | ||||
| #endif /* CUDA_DISABLER */ | ||||
							
								
								
									
										121
									
								
								modules/gpuimgproc/src/cuda/blend.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										121
									
								
								modules/gpuimgproc/src/cuda/blend.cu
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,121 @@ | ||||
| /*M/////////////////////////////////////////////////////////////////////////////////////// | ||||
| // | ||||
| //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. | ||||
| // | ||||
| //  By downloading, copying, installing or using the software you agree to this license. | ||||
| //  If you do not agree to this license, do not download, install, | ||||
| //  copy or use the software. | ||||
| // | ||||
| // | ||||
| //                           License Agreement | ||||
| //                For Open Source Computer Vision Library | ||||
| // | ||||
| // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. | ||||
| // Copyright (C) 2009, Willow Garage Inc., all rights reserved. | ||||
| // Third party copyrights are property of their respective owners. | ||||
| // | ||||
| // Redistribution and use in source and binary forms, with or without modification, | ||||
| // are permitted provided that the following conditions are met: | ||||
| // | ||||
| //   * Redistribution's of source code must retain the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer. | ||||
| // | ||||
| //   * Redistribution's in binary form must reproduce the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer in the documentation | ||||
| //     and/or other materials provided with the distribution. | ||||
| // | ||||
| //   * The name of the copyright holders may not be used to endorse or promote products | ||||
| //     derived from this software without specific prior written permission. | ||||
| // | ||||
| // This software is provided by the copyright holders and contributors "as is" and | ||||
| // any express or implied warranties, including, but not limited to, the implied | ||||
| // warranties of merchantability and fitness for a particular purpose are disclaimed. | ||||
| // In no event shall the Intel Corporation or contributors be liable for any direct, | ||||
| // indirect, incidental, special, exemplary, or consequential damages | ||||
| // (including, but not limited to, procurement of substitute goods or services; | ||||
| // loss of use, data, or profits; or business interruption) however caused | ||||
| // and on any theory of liability, whether in contract, strict liability, | ||||
| // or tort (including negligence or otherwise) arising in any way out of | ||||
| // the use of this software, even if advised of the possibility of such damage. | ||||
| // | ||||
| //M*/ | ||||
|  | ||||
| #if !defined CUDA_DISABLER | ||||
|  | ||||
| #include "opencv2/core/cuda/common.hpp" | ||||
|  | ||||
| namespace cv { namespace gpu { namespace cudev | ||||
| { | ||||
|     namespace blend | ||||
|     { | ||||
|         // Per-element weighted blend of two images; each thread handles one scalar element. | ||||
|         // | ||||
|         // rows, cols          - extent of the element grid handled by the launch | ||||
|         // cn                  - divisor mapping an element column to its weight column (x / cn) | ||||
|         // img1, img2          - input images | ||||
|         // weights1, weights2  - float weights, indexed by (y, x / cn) | ||||
|         // result              - output image | ||||
|         template <typename T> | ||||
|         __global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2, | ||||
|                                           const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result) | ||||
|         { | ||||
|             const int x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             if (y >= rows || x >= cols) | ||||
|                 return; | ||||
|  | ||||
|             // Weights are stored once per cn elements. | ||||
|             const int wx = x / cn; | ||||
|             const float w1 = weights1.ptr(y)[wx]; | ||||
|             const float w2 = weights2.ptr(y)[wx]; | ||||
|  | ||||
|             const T p1 = img1.ptr(y)[x]; | ||||
|             const T p2 = img2.ptr(y)[x]; | ||||
|  | ||||
|             // The 1e-5f term keeps the divisor away from zero when both weights are zero. | ||||
|             result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f); | ||||
|         } | ||||
|  | ||||
|         // Launches blendLinearKernel over rows x (cols * cn) scalar elements on the given stream. | ||||
|         // Blocks until the device is idle when the null stream is used. | ||||
|         template <typename T> | ||||
|         void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream) | ||||
|         { | ||||
|             // The kernel treats every channel as its own column. | ||||
|             const int width = cols * cn; | ||||
|  | ||||
|             dim3 blockDims(16, 16); | ||||
|             dim3 gridDims(divUp(width, blockDims.x), divUp(rows, blockDims.y)); | ||||
|  | ||||
|             blendLinearKernel<<<gridDims, blockDims, 0, stream>>>(rows, width, cn, img1, img2, weights1, weights2, result); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             if (stream == 0) | ||||
|                 cudaSafeCall(cudaDeviceSynchronize()); | ||||
|         } | ||||
|  | ||||
|         // Explicit instantiations for uchar and float elements. | ||||
|         template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream); | ||||
|         template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream); | ||||
|  | ||||
|  | ||||
|         // Weighted blend for images accessed as uchar4 pixels; each thread handles one pixel. | ||||
|         // The two weights are normalised by their sum (plus 1e-5f) before being applied. | ||||
|         __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2, | ||||
|                                               const PtrStepf weights1, const PtrStepf weights2, PtrStepb result) | ||||
|         { | ||||
|             const int x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             if (y >= rows || x >= cols) | ||||
|                 return; | ||||
|  | ||||
|             float w1 = weights1.ptr(y)[x]; | ||||
|             float w2 = weights2.ptr(y)[x]; | ||||
|  | ||||
|             // Pre-normalise the weights so the per-channel work is multiply-add only. | ||||
|             const float sum_inv = 1.f / (w1 + w2 + 1e-5f); | ||||
|             w1 *= sum_inv; | ||||
|             w2 *= sum_inv; | ||||
|  | ||||
|             // View the byte rows as rows of 4-channel pixels. | ||||
|             const uchar4* row1 = (const uchar4*)img1.ptr(y); | ||||
|             const uchar4* row2 = (const uchar4*)img2.ptr(y); | ||||
|             uchar4* out = (uchar4*)result.ptr(y); | ||||
|  | ||||
|             const uchar4 p1 = row1[x]; | ||||
|             const uchar4 p2 = row2[x]; | ||||
|  | ||||
|             out[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2, | ||||
|                                  p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2); | ||||
|         } | ||||
|  | ||||
|         // Launches blendLinearKernel8UC4 over a rows x cols pixel grid on the given stream. | ||||
|         // Blocks until the device is idle when the null stream is used. | ||||
|         void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream) | ||||
|         { | ||||
|             dim3 blockDims(16, 16); | ||||
|             dim3 gridDims(divUp(cols, blockDims.x), divUp(rows, blockDims.y)); | ||||
|  | ||||
|             blendLinearKernel8UC4<<<gridDims, blockDims, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             if (stream == 0) | ||||
|                 cudaSafeCall(cudaDeviceSynchronize()); | ||||
|         } | ||||
|     } // namespace blend | ||||
| }}} // namespace cv { namespace gpu { namespace cudev | ||||
|  | ||||
|  | ||||
| #endif /* CUDA_DISABLER */ | ||||
							
								
								
									
										494
									
								
								modules/gpuimgproc/src/cuda/canny.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										494
									
								
								modules/gpuimgproc/src/cuda/canny.cu
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,494 @@ | ||||
| /*M/////////////////////////////////////////////////////////////////////////////////////// | ||||
| // | ||||
| //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. | ||||
| // | ||||
| //  By downloading, copying, installing or using the software you agree to this license. | ||||
| //  If you do not agree to this license, do not download, install, | ||||
| //  copy or use the software. | ||||
| // | ||||
| // | ||||
| //                           License Agreement | ||||
| //                For Open Source Computer Vision Library | ||||
| // | ||||
| // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. | ||||
| // Copyright (C) 2009, Willow Garage Inc., all rights reserved. | ||||
| // Third party copyrights are property of their respective owners. | ||||
| // | ||||
| // Redistribution and use in source and binary forms, with or without modification, | ||||
| // are permitted provided that the following conditions are met: | ||||
| // | ||||
| //   * Redistribution's of source code must retain the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer. | ||||
| // | ||||
| //   * Redistribution's in binary form must reproduce the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer in the documentation | ||||
| //     and/or other materials provided with the distribution. | ||||
| // | ||||
| //   * The name of the copyright holders may not be used to endorse or promote products | ||||
| //     derived from this software without specific prior written permission. | ||||
| // | ||||
| // This software is provided by the copyright holders and contributors "as is" and | ||||
| // any express or implied warranties, including, but not limited to, the implied | ||||
| // warranties of merchantability and fitness for a particular purpose are disclaimed. | ||||
| // In no event shall the Intel Corporation or contributors be liable for any direct, | ||||
| // indirect, incidental, special, exemplary, or consequential damages | ||||
| // (including, but not limited to, procurement of substitute goods or services; | ||||
| // loss of use, data, or profits; or business interruption) however caused | ||||
| // and on any theory of liability, whether in contract, strict liability, | ||||
| // or tort (including negligence or otherwise) arising in any way out of | ||||
| // the use of this software, even if advised of the possibility of such damage. | ||||
| // | ||||
| //M*/ | ||||
|  | ||||
| #if !defined CUDA_DISABLER | ||||
|  | ||||
| #include <utility> | ||||
| #include <algorithm>//std::swap | ||||
| #include "opencv2/core/cuda/common.hpp" | ||||
| #include "opencv2/core/cuda/emulation.hpp" | ||||
| #include "opencv2/core/cuda/transform.hpp" | ||||
| #include "opencv2/core/cuda/functional.hpp" | ||||
| #include "opencv2/core/cuda/utility.hpp" | ||||
|  | ||||
| using namespace cv::gpu; | ||||
| using namespace cv::gpu::cudev; | ||||
|  | ||||
| namespace canny | ||||
| { | ||||
|     // Binary functor returning the L1 gradient magnitude |x| + |y| as a float. | ||||
|     struct L1 : binary_function<int, int, float> | ||||
|     { | ||||
|         __host__ __device__ __forceinline__ L1() {} | ||||
|         __host__ __device__ __forceinline__ L1(const L1&) {} | ||||
|  | ||||
|         __device__ __forceinline__ float operator ()(int x, int y) const | ||||
|         { | ||||
|             const int ax = ::abs(x); | ||||
|             const int ay = ::abs(y); | ||||
|             return ax + ay; | ||||
|         } | ||||
|     }; | ||||
|     // Binary functor returning the L2 gradient magnitude sqrt(x*x + y*y) as a float. | ||||
|     // The sum of squares is formed in int arithmetic before the square root is taken. | ||||
|     struct L2 : binary_function<int, int, float> | ||||
|     { | ||||
|         __host__ __device__ __forceinline__ L2() {} | ||||
|         __host__ __device__ __forceinline__ L2(const L2&) {} | ||||
|  | ||||
|         __device__ __forceinline__ float operator ()(int x, int y) const | ||||
|         { | ||||
|             const int sumSq = x * x + y * y; | ||||
|             return ::sqrtf(sumSq); | ||||
|         } | ||||
|     }; | ||||
| } | ||||
|  | ||||
| namespace cv { namespace gpu { namespace cudev | ||||
| { | ||||
|     // Transform traits for the Canny norm functors: everything is inherited from the | ||||
|     // default traits except smart_shift, which is overridden to 4 for both L1 and L2. | ||||
|     template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1> | ||||
|     { | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2> | ||||
|     { | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
| }}} | ||||
|  | ||||
| namespace canny | ||||
| { | ||||
|     // 2D texture reference over uchar pixels, using point filtering and clamp addressing. | ||||
|     texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp); | ||||
|  | ||||
|     // Pixel accessor that reads tex_src at (x, y) shifted by a fixed (xoff, yoff) offset. | ||||
|     struct SrcTex | ||||
|     { | ||||
|         int xoff; | ||||
|         int yoff; | ||||
|  | ||||
|         __host__ SrcTex(int _xoff, int _yoff) | ||||
|         { | ||||
|             xoff = _xoff; | ||||
|             yoff = _yoff; | ||||
|         } | ||||
|  | ||||
|         // Note the (row, column) argument order. | ||||
|         __device__ __forceinline__ int operator ()(int y, int x) const | ||||
|         { | ||||
|             const int tx = x + xoff; | ||||
|             const int ty = y + yoff; | ||||
|             return tex2D(tex_src, tx, ty); | ||||
|         } | ||||
|     }; | ||||
|  | ||||
|     // Computes the 3x3 Sobel derivatives of src at each pixel, stores them in dx and dy, | ||||
|     // and writes norm(dx, dy) to mag. Each thread handles one pixel of mag. | ||||
|     template <class Norm> __global__ | ||||
|     void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm) | ||||
|     { | ||||
|         const int x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|         const int y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|         if (y >= mag.rows || x >= mag.cols) | ||||
|             return; | ||||
|  | ||||
|         // The eight neighbours of (x, y): top/middle/bottom row, left/centre/right column. | ||||
|         const int tl = src(y - 1, x - 1); | ||||
|         const int tc = src(y - 1, x    ); | ||||
|         const int tr = src(y - 1, x + 1); | ||||
|         const int ml = src(y,     x - 1); | ||||
|         const int mr = src(y,     x + 1); | ||||
|         const int bl = src(y + 1, x - 1); | ||||
|         const int bc = src(y + 1, x    ); | ||||
|         const int br = src(y + 1, x + 1); | ||||
|  | ||||
|         // Right column minus left column, and bottom row minus top row, with 1-2-1 weights. | ||||
|         const int gx = (tr + 2 * mr + br) - (tl + 2 * ml + bl); | ||||
|         const int gy = (bl + 2 * bc + br) - (tl + 2 * tc + tr); | ||||
|  | ||||
|         dx(y, x) = gx; | ||||
|         dy(y, x) = gy; | ||||
|  | ||||
|         mag(y, x) = norm(gx, gy); | ||||
|     } | ||||
|  | ||||
|     // Computes Sobel derivatives and gradient magnitude for an 8-bit source image. | ||||
|     // | ||||
|     // srcWhole    - image bound to the tex_src texture | ||||
|     // xoff, yoff  - offset added to every texture coordinate (passed to SrcTex) | ||||
|     // dx, dy      - receive the horizontal and vertical derivatives | ||||
|     // mag         - receives the magnitude; its size defines the launch grid | ||||
|     // L2Grad      - true selects the L2 norm, false the L1 norm | ||||
|     // | ||||
|     // The kernel is launched on the default stream and the call blocks until it completes. | ||||
|     void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) | ||||
|     { | ||||
|         const dim3 block(16, 16); | ||||
|         const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y)); | ||||
|  | ||||
|         bindTexture(&tex_src, srcWhole); | ||||
|         SrcTex src(xoff, yoff); | ||||
|  | ||||
|         if (L2Grad) | ||||
|         { | ||||
|             L2 norm; | ||||
|             calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm); | ||||
|         } | ||||
|         else | ||||
|         { | ||||
|             L1 norm; | ||||
|             calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm); | ||||
|         } | ||||
|  | ||||
|         cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|         // cudaThreadSynchronize() is deprecated; use the same call as the other launchers in this file. | ||||
|         cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|     } | ||||
|  | ||||
|     // Computes the gradient magnitude from existing dx / dy derivative images by | ||||
|     // applying the selected norm functor through transform() with no mask and stream argument 0. | ||||
|     void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) | ||||
|     { | ||||
|         if (!L2Grad) | ||||
|         { | ||||
|             L1 l1Norm; | ||||
|             transform(dx, dy, mag, l1Norm, WithOutMask(), 0); | ||||
|             return; | ||||
|         } | ||||
|  | ||||
|         L2 l2Norm; | ||||
|         transform(dx, dy, mag, l2Norm, WithOutMask(), 0); | ||||
|     } | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
| namespace canny | ||||
| { | ||||
|     // 2D texture reference over float magnitudes, using point filtering and clamp addressing. | ||||
|     texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp); | ||||
|  | ||||
|     // Non-maximum suppression with double thresholding. For every pixel that is not on the | ||||
|     // image border, writes to map: | ||||
|     //   0 - the pixel can not belong to an edge | ||||
|     //   1 - the pixel might belong to an edge | ||||
|     //   2 - the pixel does belong to an edge | ||||
|     // Border pixels are not written. Magnitudes are read from tex_mag. | ||||
|     __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh) | ||||
|     { | ||||
|         // Fixed-point tan(22.5 deg), with CANNY_SHIFT fractional bits. | ||||
|         const int CANNY_SHIFT = 15; | ||||
|         const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5); | ||||
|  | ||||
|         const int x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|         const int y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|         if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1) | ||||
|             return; | ||||
|  | ||||
|         int gx = dx(y, x); | ||||
|         int gy = dy(y, x); | ||||
|  | ||||
|         // -1 when the derivatives have opposite signs, +1 otherwise; taken before abs(). | ||||
|         const int s = (gx ^ gy) < 0 ? -1 : 1; | ||||
|         const float m = tex2D(tex_mag, x, y); | ||||
|  | ||||
|         gx = ::abs(gx); | ||||
|         gy = ::abs(gy); | ||||
|  | ||||
|         int edge_type = 0; | ||||
|  | ||||
|         if (m > low_thresh) | ||||
|         { | ||||
|             const int tg22x = gx * TG22; | ||||
|             const int tg67x = tg22x + ((gx + gx) << CANNY_SHIFT); | ||||
|  | ||||
|             gy <<= CANNY_SHIFT; | ||||
|  | ||||
|             // Offset of the neighbour pair to compare against. | ||||
|             int ox; | ||||
|             int oy; | ||||
|  | ||||
|             if (gy < tg22x) | ||||
|             { | ||||
|                 // compare with the left / right neighbours | ||||
|                 ox = 1; | ||||
|                 oy = 0; | ||||
|             } | ||||
|             else if (gy > tg67x) | ||||
|             { | ||||
|                 // compare with the upper / lower neighbours | ||||
|                 ox = 0; | ||||
|                 oy = 1; | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 // compare along the diagonal selected by s | ||||
|                 ox = s; | ||||
|                 oy = 1; | ||||
|             } | ||||
|  | ||||
|             if (m > tex2D(tex_mag, x - ox, y - oy) && m >= tex2D(tex_mag, x + ox, y + oy)) | ||||
|                 edge_type = 1 + (int)(m > high_thresh); | ||||
|         } | ||||
|  | ||||
|         map(y, x) = edge_type; | ||||
|     } | ||||
|  | ||||
|     // Binds mag to tex_mag, runs calcMapKernel over a grid sized from dx, and blocks | ||||
|     // until the kernel has finished. | ||||
|     void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh) | ||||
|     { | ||||
|         bindTexture(&tex_mag, mag); | ||||
|  | ||||
|         const dim3 threads(16, 16); | ||||
|         const dim3 blocks(divUp(dx.cols, threads.x), divUp(dx.rows, threads.y)); | ||||
|  | ||||
|         calcMapKernel<<<blocks, threads>>>(dx, dy, map, low_thresh, high_thresh); | ||||
|         cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|         cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|     } | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
| namespace canny | ||||
| { | ||||
|     // Number of coordinates appended to the output list by edgesHysteresisLocalKernel. | ||||
|     __device__ int counter = 0; | ||||
|  | ||||
|     // Tile-local hysteresis pass over the edge map. | ||||
|     // | ||||
|     // Each block loads its tile of map plus a one-pixel halo into shared memory (the 18x18 | ||||
|     // buffer fits a block of at most 16x16 threads), promotes "maybe" pixels (1) that touch | ||||
|     // a definite edge pixel (2), and writes the tile back. Every pixel that ends up as 2 and | ||||
|     // still has a neighbour equal to 1 is appended to st at an index taken from counter. | ||||
|     // | ||||
|     // map - edge map, updated in place | ||||
|     // st  - output coordinate list; its capacity is not checked here | ||||
|     __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st) | ||||
|     { | ||||
|         __shared__ volatile int smem[18][18]; | ||||
|  | ||||
|         const int x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|         const int y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|         // In partial edge tiles some threads lie outside the image. Halo loads must not use | ||||
|         // their own out-of-range coordinate, otherwise they read outside the map. | ||||
|         const bool xInside = x < map.cols; | ||||
|         const bool yInside = y < map.rows; | ||||
|  | ||||
|         smem[threadIdx.y + 1][threadIdx.x + 1] = xInside && yInside ? map(y, x) : 0; | ||||
|         if (threadIdx.y == 0) | ||||
|             smem[0][threadIdx.x + 1] = y > 0 && xInside ? map(y - 1, x) : 0; | ||||
|         if (threadIdx.y == blockDim.y - 1) | ||||
|             smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows && xInside ? map(y + 1, x) : 0; | ||||
|         if (threadIdx.x == 0) | ||||
|             smem[threadIdx.y + 1][0] = x > 0 && yInside ? map(y, x - 1) : 0; | ||||
|         if (threadIdx.x == blockDim.x - 1) | ||||
|             smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols && yInside ? map(y, x + 1) : 0; | ||||
|         if (threadIdx.x == 0 && threadIdx.y == 0) | ||||
|             smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0; | ||||
|         if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0) | ||||
|             smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0; | ||||
|         if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1) | ||||
|             smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0; | ||||
|         if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1) | ||||
|             smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0; | ||||
|  | ||||
|         __syncthreads(); | ||||
|  | ||||
|         if (x >= map.cols || y >= map.rows) | ||||
|             return; | ||||
|  | ||||
|         int n; | ||||
|  | ||||
|         // 16 propagation rounds inside the tile. There is no barrier between rounds; | ||||
|         // updates are exchanged through the volatile shared buffer. | ||||
|         #pragma unroll | ||||
|         for (int k = 0; k < 16; ++k) | ||||
|         { | ||||
|             n = 0; | ||||
|  | ||||
|             if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) | ||||
|             { | ||||
|                 n += smem[threadIdx.y    ][threadIdx.x    ] == 2; | ||||
|                 n += smem[threadIdx.y    ][threadIdx.x + 1] == 2; | ||||
|                 n += smem[threadIdx.y    ][threadIdx.x + 2] == 2; | ||||
|  | ||||
|                 n += smem[threadIdx.y + 1][threadIdx.x    ] == 2; | ||||
|                 n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; | ||||
|  | ||||
|                 n += smem[threadIdx.y + 2][threadIdx.x    ] == 2; | ||||
|                 n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; | ||||
|                 n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; | ||||
|             } | ||||
|  | ||||
|             if (n > 0) | ||||
|                 smem[threadIdx.y + 1][threadIdx.x + 1] = 2; | ||||
|         } | ||||
|  | ||||
|         const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; | ||||
|  | ||||
|         map(y, x) = e; | ||||
|  | ||||
|         n = 0; | ||||
|  | ||||
|         // Count neighbours that are still undecided next to a definite edge pixel. | ||||
|         if (e == 2) | ||||
|         { | ||||
|             n += smem[threadIdx.y    ][threadIdx.x    ] == 1; | ||||
|             n += smem[threadIdx.y    ][threadIdx.x + 1] == 1; | ||||
|             n += smem[threadIdx.y    ][threadIdx.x + 2] == 1; | ||||
|  | ||||
|             n += smem[threadIdx.y + 1][threadIdx.x    ] == 1; | ||||
|             n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; | ||||
|  | ||||
|             n += smem[threadIdx.y + 2][threadIdx.x    ] == 1; | ||||
|             n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; | ||||
|             n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; | ||||
|         } | ||||
|  | ||||
|         if (n > 0) | ||||
|         { | ||||
|             // Reserve a slot in the output list and record this pixel's coordinates. | ||||
|             const int ind =  ::atomicAdd(&counter, 1); | ||||
|             st[ind] = make_ushort2(x, y); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // Host launcher for the block-local hysteresis pass. | ||||
|     // | ||||
|     // Zeroes the device-side `counter`, runs edgesHysteresisLocalKernel over `map` | ||||
|     // with 16x16 thread blocks and blocks until the device has finished. | ||||
|     // | ||||
|     //   map - label image handed to the kernel (the kernel writes it in place) | ||||
|     //   st1 - device buffer handed to the kernel; presumably receives the | ||||
|     //         coordinates the kernel queues for the global pass | ||||
|     void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1) | ||||
|     { | ||||
|         // The kernel appends through `counter`, so it has to start from zero. | ||||
|         void* d_counter; | ||||
|         cudaSafeCall( cudaGetSymbolAddress(&d_counter, counter) ); | ||||
|         cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) ); | ||||
|  | ||||
|         // One thread per pixel, rounded up to whole 16x16 blocks. | ||||
|         const dim3 threads(16, 16); | ||||
|         const dim3 blocks(divUp(map.cols, threads.x), divUp(map.rows, threads.y)); | ||||
|  | ||||
|         edgesHysteresisLocalKernel<<<blocks, threads>>>(map, st1); | ||||
|  | ||||
|         // First check reports launch errors, second one execution errors. | ||||
|         cudaSafeCall( cudaGetLastError() ); | ||||
|         cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|     } | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
| namespace canny | ||||
| { | ||||
|     // Column (c_dx) and row (c_dy) offsets of the 8 neighbours of a pixel, listed | ||||
|     // from (-1,-1) to (1,1) with the centre (0,0) omitted. Entry k of both tables | ||||
|     // belongs to the same neighbour. | ||||
|     __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1}; | ||||
|     __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1}; | ||||
|  | ||||
|     // Grows strong edges outwards from the seed pixels listed in st1. | ||||
|     // | ||||
|     // Each thread block takes one seed (flat block index into st1; blocks whose | ||||
|     // index is >= count exit immediately). Every 8-neighbour of the seed whose | ||||
|     // label in `map` is 1 is relabelled 2 and pushed on a shared-memory stack; the | ||||
|     // block then keeps popping pixels and expanding their neighbours the same way. | ||||
|     // The loop ends when the stack is empty or holds more than | ||||
|     // stack_size - blockDim.x entries. Whatever is still on the stack is appended | ||||
|     // to st2 and the global `counter` is advanced by that amount. | ||||
|     // | ||||
|     // Threads work in groups of 8, one thread per neighbour direction, indexed by | ||||
|     // threadIdx.x only. NOTE(review): the stack bound presumably relies on | ||||
|     // blockDim.x being a multiple of 8 and well below stack_size - confirm against | ||||
|     // the host launcher. | ||||
|     // | ||||
|     //   map   - label image, modified in place (1 -> 2) | ||||
|     //   st1   - input list of seed coordinates, `count` entries | ||||
|     //   st2   - output list receiving the pixels this block could not finish | ||||
|     //   count - number of valid entries in st1 | ||||
|     __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count) | ||||
|     { | ||||
|         const int stack_size = 512; | ||||
|  | ||||
|         __shared__ int s_counter;            // number of entries currently on s_st | ||||
|         __shared__ int s_ind;                // first slot of st2 reserved for this block | ||||
|         __shared__ ushort2 s_st[stack_size]; // block-local stack of pixels to expand | ||||
|  | ||||
|         if (threadIdx.x == 0) | ||||
|             s_counter = 0; | ||||
|  | ||||
|         __syncthreads(); | ||||
|  | ||||
|         // The index depends on blockIdx only, so the whole block returns together | ||||
|         // and no thread is left waiting at a later barrier. | ||||
|         int ind = blockIdx.y * gridDim.x + blockIdx.x; | ||||
|  | ||||
|         if (ind >= count) | ||||
|             return; | ||||
|  | ||||
|         ushort2 pos = st1[ind]; | ||||
|  | ||||
|         // Seed expansion: the first 8 threads each look at one neighbour. | ||||
|         if (threadIdx.x < 8) | ||||
|         { | ||||
|             pos.x += c_dx[threadIdx.x]; | ||||
|             pos.y += c_dy[threadIdx.x]; | ||||
|  | ||||
|             // Row 0 and column 0 are never promoted (the `> 0` tests). Coordinates | ||||
|             // are unsigned, so stepping left/up from 0 wraps to a large value that | ||||
|             // the upper-bound tests reject for images smaller than 65535. | ||||
|             if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1) | ||||
|             { | ||||
|                 map(pos.y, pos.x) = 2; | ||||
|  | ||||
|                 ind = Emulation::smem::atomicAdd(&s_counter, 1); | ||||
|  | ||||
|                 s_st[ind] = pos; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         __syncthreads(); | ||||
|  | ||||
|         // Keep going while there is work and while a full round of pushes (at most | ||||
|         // one per thread) still fits on the stack. | ||||
|         while (s_counter > 0 && s_counter <= stack_size - blockDim.x) | ||||
|         { | ||||
|             const int subTaskIdx = threadIdx.x >> 3; | ||||
|             const int portion = ::min(s_counter, blockDim.x >> 3); | ||||
|  | ||||
|             // Each group of 8 threads pops one entry from the top of the stack. | ||||
|             if (subTaskIdx < portion) | ||||
|                 pos = s_st[s_counter - 1 - subTaskIdx]; | ||||
|  | ||||
|             // Everybody must have read s_counter before thread 0 shrinks it. | ||||
|             __syncthreads(); | ||||
|  | ||||
|             if (threadIdx.x == 0) | ||||
|                 s_counter -= portion; | ||||
|  | ||||
|             __syncthreads(); | ||||
|  | ||||
|             if (subTaskIdx < portion) | ||||
|             { | ||||
|                 // The low 3 bits of the thread index select the neighbour direction. | ||||
|                 pos.x += c_dx[threadIdx.x & 7]; | ||||
|                 pos.y += c_dy[threadIdx.x & 7]; | ||||
|  | ||||
|                 if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1) | ||||
|                 { | ||||
|                     map(pos.y, pos.x) = 2; | ||||
|  | ||||
|                     ind = Emulation::smem::atomicAdd(&s_counter, 1); | ||||
|  | ||||
|                     s_st[ind] = pos; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             __syncthreads(); | ||||
|         } | ||||
|  | ||||
|         // Spill the unfinished work to st2. s_counter is shared and no longer | ||||
|         // modified, so the branch (and the barrier inside it) is block-uniform. | ||||
|         if (s_counter > 0) | ||||
|         { | ||||
|             if (threadIdx.x == 0) | ||||
|             { | ||||
|                 // atomicAdd returns the value of `counter` *before* the addition, | ||||
|                 // which already is the first slot reserved for this block. The | ||||
|                 // earlier code subtracted s_counter from it, so every block wrote | ||||
|                 // below its reservation (before st2[0] for the first block). | ||||
|                 s_ind = ::atomicAdd(&counter, s_counter); | ||||
|             } | ||||
|  | ||||
|             __syncthreads(); | ||||
|  | ||||
|             ind = s_ind; | ||||
|  | ||||
|             for (int i = threadIdx.x; i < s_counter; i += blockDim.x) | ||||
|                 st2[ind + i] = s_st[i]; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // Host driver for the global hysteresis pass. | ||||
|     // | ||||
|     // Reads the number of pending pixels from the device-side `counter`, then | ||||
|     // repeatedly launches edgesHysteresisGlobalKernel (one 128-thread block per | ||||
|     // pending pixel) until a launch leaves the counter at zero. After each launch | ||||
|     // the two coordinate buffers swap roles. | ||||
|     // | ||||
|     //   map - label image handed to the kernel | ||||
|     //   st1 - device buffer holding the current pending list | ||||
|     //   st2 - device buffer receiving the next pending list | ||||
|     void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2) | ||||
|     { | ||||
|         void* d_counter; | ||||
|         cudaSafeCall( cudaGetSymbolAddress(&d_counter, canny::counter) ); | ||||
|  | ||||
|         int pending; | ||||
|         cudaSafeCall( cudaMemcpy(&pending, d_counter, sizeof(int), cudaMemcpyDeviceToHost) ); | ||||
|  | ||||
|         const dim3 block(128); | ||||
|  | ||||
|         // The swap in the increment slot runs after every pass, so the output of | ||||
|         // one launch becomes the input of the next. | ||||
|         for (; pending > 0; std::swap(st1, st2)) | ||||
|         { | ||||
|             // The kernel appends through the counter, so restart it from zero. | ||||
|             cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) ); | ||||
|  | ||||
|             // One block per pending pixel, folded into 2D once x reaches 65535. | ||||
|             const dim3 grid(::min(pending, 65535u), divUp(pending, 65535), 1); | ||||
|  | ||||
|             edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, pending); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|  | ||||
|             // Number of pixels the launch queued for the next round. | ||||
|             cudaSafeCall( cudaMemcpy(&pending, d_counter, sizeof(int), cudaMemcpyDeviceToHost) ); | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| ////////////////////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
| namespace canny | ||||
| { | ||||
|     // Unary functor that converts an int label into an 8-bit mask value. | ||||
|     // For the labels 0, 1 and 2 the results are 0, 0 and 255 respectively. | ||||
|     struct GetEdges : unary_function<int, uchar> | ||||
|     { | ||||
|         __host__ __device__ __forceinline__ GetEdges() {} | ||||
|         __host__ __device__ __forceinline__ GetEdges(const GetEdges&) {} | ||||
|  | ||||
|         __device__ __forceinline__ uchar operator ()(int e) const | ||||
|         { | ||||
|             // e >> 1 drops the low bit (2 -> 1, 0/1 -> 0); negating yields -1 or | ||||
|             // 0, which narrows to 0xFF or 0x00. | ||||
|             const int negated = -(e >> 1); | ||||
|             return static_cast<uchar>(negated); | ||||
|         } | ||||
|     }; | ||||
| } | ||||
|  | ||||
| namespace cv | ||||
| { | ||||
| namespace gpu | ||||
| { | ||||
| namespace cudev | ||||
| { | ||||
|     // Traits override for canny::GetEdges: everything is inherited from | ||||
|     // DefaultTransformFunctorTraits except smart_shift, which is set to 4. | ||||
|     // NOTE(review): the meaning of smart_shift is defined by the transform | ||||
|     // implementation, which is not part of this file. | ||||
|     template <> | ||||
|     struct TransformFunctorTraits<canny::GetEdges> | ||||
|         : DefaultTransformFunctorTraits<canny::GetEdges> | ||||
|     { | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
| } // namespace cudev | ||||
| } // namespace gpu | ||||
| } // namespace cv | ||||
|  | ||||
| namespace canny | ||||
| { | ||||
|     // Applies GetEdges to every element of `map` and stores the results in `dst`, | ||||
|     // without a mask and on stream 0. | ||||
|     void getEdges(PtrStepSzi map, PtrStepSzb dst) | ||||
|     { | ||||
|         const cudaStream_t defaultStream = 0; | ||||
|  | ||||
|         transform(map, dst, GetEdges(), WithOutMask(), defaultStream); | ||||
|     } | ||||
| } | ||||
|  | ||||
| #endif /* CUDA_DISABLER */ | ||||
							
								
								
									
										186
									
								
								modules/gpuimgproc/src/cuda/clahe.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										186
									
								
								modules/gpuimgproc/src/cuda/clahe.cu
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,186 @@ | ||||
| /*M/////////////////////////////////////////////////////////////////////////////////////// | ||||
| // | ||||
| //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. | ||||
| // | ||||
| //  By downloading, copying, installing or using the software you agree to this license. | ||||
| //  If you do not agree to this license, do not download, install, | ||||
| //  copy or use the software. | ||||
| // | ||||
| // | ||||
| //                           License Agreement | ||||
| //                For Open Source Computer Vision Library | ||||
| // | ||||
| // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. | ||||
| // Copyright (C) 2009, Willow Garage Inc., all rights reserved. | ||||
| // Third party copyrights are property of their respective owners. | ||||
| // | ||||
| // Redistribution and use in source and binary forms, with or without modification, | ||||
| // are permitted provided that the following conditions are met: | ||||
| // | ||||
| //   * Redistribution's of source code must retain the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer. | ||||
| // | ||||
| //   * Redistribution's in binary form must reproduce the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer in the documentation | ||||
| //     and/or other materials provided with the distribution. | ||||
| // | ||||
| //   * The name of the copyright holders may not be used to endorse or promote products | ||||
| //     derived from this software without specific prior written permission. | ||||
| // | ||||
| // This software is provided by the copyright holders and contributors "as is" and | ||||
| // any express or implied warranties, including, but not limited to, the implied | ||||
| // warranties of merchantability and fitness for a particular purpose are disclaimed. | ||||
| // In no event shall the Intel Corporation or contributors be liable for any direct, | ||||
| // indirect, incidental, special, exemplary, or consequential damages | ||||
| // (including, but not limited to, procurement of substitute goods or services; | ||||
| // loss of use, data, or profits; or business interruption) however caused | ||||
| // and on any theory of liability, whether in contract, strict liability, | ||||
| // or tort (including negligence or otherwise) arising in any way out of | ||||
| // the use of this software, even if advised of the possibility of such damage. | ||||
| // | ||||
| //M*/ | ||||
|  | ||||
| #if !defined CUDA_DISABLER | ||||
|  | ||||
| #include "opencv2/core/cuda/common.hpp" | ||||
| #include "opencv2/core/cuda/functional.hpp" | ||||
| #include "opencv2/core/cuda/emulation.hpp" | ||||
| #include "opencv2/core/cuda/scan.hpp" | ||||
| #include "opencv2/core/cuda/reduce.hpp" | ||||
| #include "opencv2/core/cuda/saturate_cast.hpp" | ||||
|  | ||||
| using namespace cv::gpu; | ||||
| using namespace cv::gpu::cudev; | ||||
|  | ||||
| namespace clahe | ||||
| { | ||||
|     // Builds the lookup table of one tile per thread block. | ||||
|     // | ||||
|     // Block (blockIdx.x, blockIdx.y) handles tile (tx, ty) and fills row | ||||
|     // ty * tilesX + tx of `lut`; each thread writes the column equal to its flat | ||||
|     // thread index `tid`. The reduce<256> / blockScanInclusive<256> calls and the | ||||
|     // fixed divisor 256 suggest the kernel needs exactly 256 threads per block | ||||
|     // - NOTE(review): confirm against those helpers. | ||||
|     // | ||||
|     //   src       - 8-bit source image; tile pixels are read without bounds checks | ||||
|     //   lut       - output table, one row per tile | ||||
|     //   tileSize  - tile width (x) and height (y) in pixels | ||||
|     //   tilesX    - number of tiles in one row of tiles | ||||
|     //   clipLimit - cap applied to each histogram bin; no clipping when <= 0 | ||||
|     //   lutScale  - factor applied to the scanned histogram before rounding | ||||
|     __global__ void calcLutKernel(const PtrStepb src, PtrStepb lut, | ||||
|                                   const int2 tileSize, const int tilesX, | ||||
|                                   const int clipLimit, const float lutScale) | ||||
|     { | ||||
|         // Scratch buffer: first the histogram, later reused by reduce and scan. | ||||
|         __shared__ int smem[512]; | ||||
|  | ||||
|         const int tx = blockIdx.x; | ||||
|         const int ty = blockIdx.y; | ||||
|         const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; | ||||
|  | ||||
|         // Each thread clears the bin matching its own index. | ||||
|         smem[tid] = 0; | ||||
|         __syncthreads(); | ||||
|  | ||||
|         // Histogram of the tile: threads stride over rows (y) and columns (x). | ||||
|         for (int i = threadIdx.y; i < tileSize.y; i += blockDim.y) | ||||
|         { | ||||
|             const uchar* srcPtr = src.ptr(ty * tileSize.y + i) + tx * tileSize.x; | ||||
|             for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x) | ||||
|             { | ||||
|                 const int data = srcPtr[j]; | ||||
|                 Emulation::smem::atomicAdd(&smem[data], 1); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         __syncthreads(); | ||||
|  | ||||
|         // From here on every thread owns one histogram bin in a register. | ||||
|         int tHistVal = smem[tid]; | ||||
|  | ||||
|         // All bins must be read before smem is reused below. | ||||
|         __syncthreads(); | ||||
|  | ||||
|         if (clipLimit > 0) | ||||
|         { | ||||
|             // clip histogram bar | ||||
|  | ||||
|             int clipped = 0; | ||||
|             if (tHistVal > clipLimit) | ||||
|             { | ||||
|                 clipped = tHistVal - clipLimit; | ||||
|                 tHistVal = clipLimit; | ||||
|             } | ||||
|  | ||||
|             // find number of overall clipped samples | ||||
|  | ||||
|             reduce<256>(smem, clipped, tid, plus<int>()); | ||||
|  | ||||
|             // broadcast evaluated value | ||||
|  | ||||
|             __shared__ int totalClipped; | ||||
|  | ||||
|             // Only thread 0's `clipped` is taken as the block-wide sum. | ||||
|             if (tid == 0) | ||||
|                 totalClipped = clipped; | ||||
|             __syncthreads(); | ||||
|  | ||||
|             // redistribute clipped samples evenly | ||||
|  | ||||
|             int redistBatch = totalClipped / 256; | ||||
|             tHistVal += redistBatch; | ||||
|  | ||||
|             // The remainder goes to the lowest-indexed bins, one sample each. | ||||
|             int residual = totalClipped - redistBatch * 256; | ||||
|             if (tid < residual) | ||||
|                 ++tHistVal; | ||||
|         } | ||||
|  | ||||
|         // Presumably an inclusive prefix sum over the bins, i.e. the cumulative | ||||
|         // histogram - NOTE(review): confirm in scan.hpp. | ||||
|         const int lutVal = blockScanInclusive<256>(tHistVal, smem, tid); | ||||
|  | ||||
|         // Scale, round to nearest integer and saturate to the uchar range. | ||||
|         lut(ty * tilesX + tx, tid) = saturate_cast<uchar>(__float2int_rn(lutScale * lutVal)); | ||||
|     } | ||||
|  | ||||
|     // Launches calcLutKernel with one 32x8 thread block per tile. | ||||
|     // | ||||
|     // The launch goes to `stream`; when that is the default stream (0) the call | ||||
|     // also waits for the device to finish. All other arguments are forwarded to | ||||
|     // the kernel unchanged (tilesY only determines the grid height). | ||||
|     void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream) | ||||
|     { | ||||
|         const dim3 threadsPerBlock(32, 8); | ||||
|         const dim3 tileGrid(tilesX, tilesY); | ||||
|  | ||||
|         calcLutKernel<<<tileGrid, threadsPerBlock, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale); | ||||
|         cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|         const bool onDefaultStream = (stream == 0); | ||||
|         if (onDefaultStream) | ||||
|         { | ||||
|             cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // Maps every source pixel through the lookup tables of the up to four tiles | ||||
|     // surrounding it and blends the four results bilinearly. | ||||
|     // | ||||
|     // One thread per pixel; threads outside the src.cols x src.rows area do | ||||
|     // nothing. Tile coordinates are measured from tile centres (hence the -0.5) | ||||
|     // and clamped to [0, tilesX - 1] / [0, tilesY - 1], so border pixels blend a | ||||
|     // tile with itself. | ||||
|     // | ||||
|     //   src      - 8-bit source image | ||||
|     //   dst      - 8-bit destination image, written at the same (y, x) | ||||
|     //   lut      - table indexed as lut(tileRow * tilesX + tileCol, pixelValue) | ||||
|     //   tileSize - tile width (x) and height (y) in pixels | ||||
|     //   tilesX, tilesY - number of tiles horizontally / vertically | ||||
|     __global__ void tranformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY) | ||||
|     { | ||||
|         const int col = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|         const int row = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|         if (col < src.cols && row < src.rows) | ||||
|         { | ||||
|             // Horizontal tile pair and the weight of the right-hand tile. | ||||
|             const float tileXf = (static_cast<float>(col) / tileSize.x) - 0.5f; | ||||
|             const int tileXFloor = __float2int_rd(tileXf); | ||||
|             const float xa = tileXf - tileXFloor; | ||||
|             const int tileLeft = ::max(tileXFloor, 0); | ||||
|             const int tileRight = ::min(tileXFloor + 1, tilesX - 1); | ||||
|  | ||||
|             // Vertical tile pair and the weight of the lower tile. | ||||
|             const float tileYf = (static_cast<float>(row) / tileSize.y) - 0.5f; | ||||
|             const int tileYFloor = __float2int_rd(tileYf); | ||||
|             const float ya = tileYf - tileYFloor; | ||||
|             const int tileTop = ::max(tileYFloor, 0); | ||||
|             const int tileBottom = ::min(tileYFloor + 1, tilesY - 1); | ||||
|  | ||||
|             const int pixel = src(row, col); | ||||
|  | ||||
|             // Accumulate the four weighted table entries: top-left, top-right, | ||||
|             // bottom-left, bottom-right. | ||||
|             float blended = 0; | ||||
|  | ||||
|             blended += lut(tileTop * tilesX + tileLeft, pixel) * ((1.0f - xa) * (1.0f - ya)); | ||||
|             blended += lut(tileTop * tilesX + tileRight, pixel) * ((xa) * (1.0f - ya)); | ||||
|             blended += lut(tileBottom * tilesX + tileLeft, pixel) * ((1.0f - xa) * (ya)); | ||||
|             blended += lut(tileBottom * tilesX + tileRight, pixel) * ((xa) * (ya)); | ||||
|  | ||||
|             dst(row, col) = saturate_cast<uchar>(blended); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // Launches tranformKernel with one thread per source pixel (32x8 blocks, | ||||
|     // grid rounded up to cover src.cols x src.rows). | ||||
|     // | ||||
|     // The kernel is asked to prefer L1 cache over shared memory. The launch goes | ||||
|     // to `stream`; on the default stream (0) the call also waits for completion. | ||||
|     void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream) | ||||
|     { | ||||
|         const dim3 threadsPerBlock(32, 8); | ||||
|         const dim3 pixelGrid(divUp(src.cols, threadsPerBlock.x), divUp(src.rows, threadsPerBlock.y)); | ||||
|  | ||||
|         cudaSafeCall( cudaFuncSetCacheConfig(tranformKernel, cudaFuncCachePreferL1) ); | ||||
|  | ||||
|         tranformKernel<<<pixelGrid, threadsPerBlock, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY); | ||||
|         cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|         const bool onDefaultStream = (stream == 0); | ||||
|         if (onDefaultStream) | ||||
|         { | ||||
|             cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #endif // CUDA_DISABLER | ||||
							
								
								
									
										461
									
								
								modules/gpuimgproc/src/cuda/color.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										461
									
								
								modules/gpuimgproc/src/cuda/color.cu
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,461 @@ | ||||
| /*M/////////////////////////////////////////////////////////////////////////////////////// | ||||
| // | ||||
| //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. | ||||
| // | ||||
| //  By downloading, copying, installing or using the software you agree to this license. | ||||
| //  If you do not agree to this license, do not download, install, | ||||
| //  copy or use the software. | ||||
| // | ||||
| // | ||||
| //                           License Agreement | ||||
| //                For Open Source Computer Vision Library | ||||
| // | ||||
| // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. | ||||
| // Copyright (C) 2009, Willow Garage Inc., all rights reserved. | ||||
| // Third party copyrights are property of their respective owners. | ||||
| // | ||||
| // Redistribution and use in source and binary forms, with or without modification, | ||||
| // are permitted provided that the following conditions are met: | ||||
| // | ||||
| //   * Redistribution's of source code must retain the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer. | ||||
| // | ||||
| //   * Redistribution's in binary form must reproduce the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer in the documentation | ||||
| //     and/or other materials provided with the distribution. | ||||
| // | ||||
| //   * The name of the copyright holders may not be used to endorse or promote products | ||||
| //     derived from this software without specific prior written permission. | ||||
| // | ||||
| // This software is provided by the copyright holders and contributors "as is" and | ||||
| // any express or implied warranties, including, but not limited to, the implied | ||||
| // warranties of merchantability and fitness for a particular purpose are disclaimed. | ||||
| // In no event shall the Intel Corporation or contributors be liable for any direct, | ||||
| // indirect, incidental, special, exemplary, or consequential damages | ||||
| // (including, but not limited to, procurement of substitute goods or services; | ||||
| // loss of use, data, or profits; or business interruption) however caused | ||||
| // and on any theory of liability, whether in contract, strict liability, | ||||
| // or tort (including negligence or otherwise) arising in any way out of | ||||
| // the use of this software, even if advised of the possibility of such damage. | ||||
| // | ||||
| //M*/ | ||||
|  | ||||
| #if !defined CUDA_DISABLER | ||||
|  | ||||
| #include "opencv2/core/cuda/common.hpp" | ||||
| #include "opencv2/core/cuda/transform.hpp" | ||||
| #include "opencv2/core/cuda/color.hpp" | ||||
| #include "cvt_color_internal.h" | ||||
|  | ||||
| namespace cv { namespace gpu { namespace cudev | ||||
| { | ||||
|     // Per-functor overrides of the smart_block_dim_x / smart_block_dim_y / | ||||
|     // smart_shift constants for the colour conversion functors named in each | ||||
|     // macro argument. OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS is defined outside this | ||||
|     // file; presumably it opens a traits specialisation for the given functor type | ||||
|     // - NOTE(review): confirm the macro and the meaning of the constants there. | ||||
|  | ||||
|     // 4-channel uchar channel swap. | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_x = 8 }; | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|  | ||||
|     // 4-channel -> packed BGR555 / BGR565. | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|  | ||||
|     // Packed BGR555 / BGR565 -> 4-channel. | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|  | ||||
|     // Gray -> 4-channel uchar. | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|  | ||||
|     // Gray -> packed BGR555 / BGR565 (only smart_shift is overridden). | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type) | ||||
|     { | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type) | ||||
|     { | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|  | ||||
|     // Overrides for the uchar 4-channel <-> 4-channel colour space functors | ||||
|     // (YUV, YCrCb, XYZ, HSV, HLS). Every entry sets smart_block_dim_y = 8 and | ||||
|     // smart_shift = 4. NOTE(review): the macro and the meaning of both constants | ||||
|     // are defined outside this file - confirm there. | ||||
|  | ||||
|     // BGRA/RGBA -> YUV | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|  | ||||
|     // YUV -> BGRA/RGBA | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|  | ||||
|     // BGRA/RGBA -> YCrCb | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|  | ||||
|     // YCrCb -> BGRA/RGBA | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|  | ||||
|     // BGRA/RGBA -> XYZ | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|  | ||||
|     // XYZ -> BGRA/RGBA | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|  | ||||
|     // BGRA/RGBA -> HSV | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|  | ||||
|     // HSV -> BGRA/RGBA | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|  | ||||
|     // BGRA/RGBA -> HLS | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|  | ||||
|     // HLS -> BGRA/RGBA | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|     OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type) | ||||
|     { | ||||
|         enum { smart_block_dim_y = 8 }; | ||||
|         enum { smart_shift = 4 }; | ||||
|     }; | ||||
|  | ||||
| // Defines a host function `name(src, dst, stream)` that creates the functor | ||||
| // described by `traits` and runs cv::gpu::cudev::transform with it on `stream`, | ||||
| // without a mask. The byte images are cast to images of the functor's | ||||
| // argument_type (source) and result_type (destination). | ||||
| #define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \ | ||||
|     void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) \ | ||||
|     { \ | ||||
|         traits::functor_type functor = traits::create_functor(); \ | ||||
|         typedef typename traits::functor_type::argument_type src_t; \ | ||||
|         typedef typename traits::functor_type::result_type   dst_t; \ | ||||
|         cv::gpu::cudev::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \ | ||||
|     } | ||||
|  | ||||
| // One function `name`, using the non-template traits class `name_traits`. | ||||
| #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \ | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits) | ||||
|  | ||||
| // Three functions name_8u / name_16u / name_32f, using name_traits<uchar>, | ||||
| // name_traits<ushort> and name_traits<float>. | ||||
| #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \ | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \ | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \ | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) | ||||
|  | ||||
| // Two functions name_8u / name_32f (no 16-bit variant). | ||||
| #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \ | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \ | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) | ||||
|  | ||||
| // Four functions: name_8u / name_32f from name_traits<T>, plus name_full_8u / | ||||
| // name_full_32f from name_full_traits<T>. | ||||
| #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(name) \ | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \ | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \ | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \ | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>) | ||||
|  | ||||
|     // Conversion entry points generated by the macros above. _ALL emits the | ||||
|     // _8u/_16u/_32f variants of a name, _ONE emits a single function. | ||||
|  | ||||
|     // Channel reordering and alpha add/drop between 3- and 4-channel BGR/RGB. | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba) | ||||
|  | ||||
|     // To packed BGR555 / BGR565. | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565) | ||||
|  | ||||
|     // From packed BGR555 / BGR565. | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra) | ||||
|  | ||||
|     // Gray to colour. | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra) | ||||
|  | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565) | ||||
|  | ||||
|     // Colour to gray. | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray) | ||||
|  | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray) | ||||
|  | ||||
|     // YUV, YCrCb and XYZ conversions; each line emits the _8u, _16u and _32f | ||||
|     // variants. Names carrying a trailing 4 (yuv4, YCrCb4, xyz4) presumably refer | ||||
|     // to the 4-channel layout of that colour space - NOTE(review): confirm in | ||||
|     // the traits definitions. | ||||
|  | ||||
|     // RGB/BGR -> YUV | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4) | ||||
|  | ||||
|     // YUV -> RGB/BGR | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra) | ||||
|  | ||||
|     // RGB/BGR -> YCrCb | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4) | ||||
|  | ||||
|     // YCrCb -> RGB/BGR | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra) | ||||
|  | ||||
|     // RGB/BGR -> XYZ | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4) | ||||
|  | ||||
|     // XYZ -> RGB/BGR | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra) | ||||
|  | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv4) | ||||
|  | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgra) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgra) | ||||
|  | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls4) | ||||
|  | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgra) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgra) | ||||
|  | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab4) | ||||
|  | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab4) | ||||
|  | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgra) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgra) | ||||
|  | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgra) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgra) | ||||
|  | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv4) | ||||
|  | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv4) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv4) | ||||
|  | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgra) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgra) | ||||
|  | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgb) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgba) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgr) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgra) | ||||
|     OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgra) | ||||
|  | ||||
|     // The OPENCV_GPU_IMPLEMENT_CVTCOLOR* helper macros (several are invoked directly above) are not | ||||
|     // needed past this point; undefine them so they are not visible to anything that follows. | ||||
|     #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR | ||||
|     #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE | ||||
|     #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL | ||||
|     #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F | ||||
|     #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL | ||||
| }}} // namespace cv { namespace gpu { namespace cudev | ||||
|  | ||||
| #endif /* CUDA_DISABLER */ | ||||
							
								
								
									
										274
									
								
								modules/gpuimgproc/src/cuda/corners.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										274
									
								
								modules/gpuimgproc/src/cuda/corners.cu
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,274 @@ | ||||
| /*M/////////////////////////////////////////////////////////////////////////////////////// | ||||
| // | ||||
| //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. | ||||
| // | ||||
| //  By downloading, copying, installing or using the software you agree to this license. | ||||
| //  If you do not agree to this license, do not download, install, | ||||
| //  copy or use the software. | ||||
| // | ||||
| // | ||||
| //                           License Agreement | ||||
| //                For Open Source Computer Vision Library | ||||
| // | ||||
| // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. | ||||
| // Copyright (C) 2009, Willow Garage Inc., all rights reserved. | ||||
| // Third party copyrights are property of their respective owners. | ||||
| // | ||||
| // Redistribution and use in source and binary forms, with or without modification, | ||||
| // are permitted provided that the following conditions are met: | ||||
| // | ||||
| //   * Redistribution's of source code must retain the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer. | ||||
| // | ||||
| //   * Redistribution's in binary form must reproduce the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer in the documentation | ||||
| //     and/or other materials provided with the distribution. | ||||
| // | ||||
| //   * The name of the copyright holders may not be used to endorse or promote products | ||||
| //     derived from this software without specific prior written permission. | ||||
| // | ||||
| // This software is provided by the copyright holders and contributors "as is" and | ||||
| // any express or implied warranties, including, but not limited to, the implied | ||||
| // warranties of merchantability and fitness for a particular purpose are disclaimed. | ||||
| // In no event shall the Intel Corporation or contributors be liable for any direct, | ||||
| // indirect, incidental, special, exemplary, or consequential damages | ||||
| // (including, but not limited to, procurement of substitute goods or services; | ||||
| // loss of use, data, or profits; or business interruption) however caused | ||||
| // and on any theory of liability, whether in contract, strict liability, | ||||
| // or tort (including negligence or otherwise) arising in any way out of | ||||
| // the use of this software, even if advised of the possibility of such damage. | ||||
| // | ||||
| //M*/ | ||||
|  | ||||
| #if !defined CUDA_DISABLER | ||||
|  | ||||
| #include "opencv2/core/cuda/common.hpp" | ||||
| #include "opencv2/core/cuda/vec_traits.hpp" | ||||
| #include "opencv2/core/cuda/vec_math.hpp" | ||||
| #include "opencv2/core/cuda/saturate_cast.hpp" | ||||
| #include "opencv2/core/cuda/border_interpolate.hpp" | ||||
|  | ||||
| namespace cv { namespace gpu { namespace cudev | ||||
| { | ||||
|     namespace imgproc | ||||
|     { | ||||
|         /////////////////////////////////////////// Corner Harris ///////////////////////////////////////////////// | ||||
|  | ||||
|         // 2D float texture references read by the cornerHarris kernels below; cornerHarris_gpu binds its | ||||
|         // Dx and Dy arguments to them before launching. Constructor arguments: 0 = unnormalized | ||||
|         // coordinates, point filtering, clamp addressing. | ||||
|         texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDxTex(0, cudaFilterModePoint, cudaAddressModeClamp); | ||||
|         texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDyTex(0, cudaFilterModePoint, cudaAddressModeClamp); | ||||
|  | ||||
|         // Harris response, one output pixel per thread. harrisDxTex / harrisDyTex are sampled with the raw | ||||
|         // window coordinates, so out-of-range taps are resolved by the textures' clamp addressing mode. | ||||
|         //   block_size - side length of the square summation window | ||||
|         //   k          - factor applied to the squared (a + c) term of the response | ||||
|         //   dst        - output response map; also defines the valid thread range | ||||
|         __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst) | ||||
|         { | ||||
|             const int col = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int row = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             // Threads that fall outside the destination have nothing to write. | ||||
|             if (col >= dst.cols || row >= dst.rows) | ||||
|                 return; | ||||
|  | ||||
|             const int half = block_size / 2; | ||||
|             const int top = row - half; | ||||
|             const int left = col - half; | ||||
|             const int bottom = top + block_size; | ||||
|             const int right = left + block_size; | ||||
|  | ||||
|             // Window sums of dx*dx, dx*dy and dy*dy. | ||||
|             float sxx = 0.f; | ||||
|             float sxy = 0.f; | ||||
|             float syy = 0.f; | ||||
|  | ||||
|             for (int wy = top; wy < bottom; ++wy) | ||||
|             { | ||||
|                 for (int wx = left; wx < right; ++wx) | ||||
|                 { | ||||
|                     const float gx = tex2D(harrisDxTex, wx, wy); | ||||
|                     const float gy = tex2D(harrisDyTex, wx, wy); | ||||
|  | ||||
|                     sxx += gx * gx; | ||||
|                     sxy += gx * gy; | ||||
|                     syy += gy * gy; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             dst(row, col) = sxx * syy - sxy * sxy - k * (sxx + syy) * (sxx + syy); | ||||
|         } | ||||
|  | ||||
|         // Harris response, one output pixel per thread, with explicit border handling: every window | ||||
|         // coordinate is remapped through border_col.idx_row / border_row.idx_col before the textures | ||||
|         // are sampled. | ||||
|         //   block_size - side length of the square summation window | ||||
|         //   k          - factor applied to the squared (a + c) term of the response | ||||
|         //   dst        - output response map; also defines the valid thread range | ||||
|         template <typename BR, typename BC> | ||||
|         __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col) | ||||
|         { | ||||
|             const int col = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int row = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             // Threads that fall outside the destination have nothing to write. | ||||
|             if (col >= dst.cols || row >= dst.rows) | ||||
|                 return; | ||||
|  | ||||
|             const int half = block_size / 2; | ||||
|             const int top = row - half; | ||||
|             const int left = col - half; | ||||
|             const int bottom = top + block_size; | ||||
|             const int right = left + block_size; | ||||
|  | ||||
|             // Window sums of dx*dx, dx*dy and dy*dy. | ||||
|             float sxx = 0.f; | ||||
|             float sxy = 0.f; | ||||
|             float syy = 0.f; | ||||
|  | ||||
|             for (int wy = top; wy < bottom; ++wy) | ||||
|             { | ||||
|                 // The remapped row is the same for the whole inner loop. | ||||
|                 const int srcRow = border_col.idx_row(wy); | ||||
|  | ||||
|                 for (int wx = left; wx < right; ++wx) | ||||
|                 { | ||||
|                     const int srcCol = border_row.idx_col(wx); | ||||
|  | ||||
|                     const float gx = tex2D(harrisDxTex, srcCol, srcRow); | ||||
|                     const float gy = tex2D(harrisDyTex, srcCol, srcRow); | ||||
|  | ||||
|                     sxx += gx * gx; | ||||
|                     sxy += gx * gy; | ||||
|                     syy += gy * gy; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             dst(row, col) = sxx * syy - sxy * sxy - k * (sxx + syy) * (sxx + syy); | ||||
|         } | ||||
|  | ||||
|         // Host launcher for the Harris kernels. Binds Dx / Dy to the Harris textures, launches the kernel | ||||
|         // variant matching border_type on `stream`, and checks the launch status. | ||||
|         // Only BORDER_REFLECT101, BORDER_REFLECT and BORDER_REPLICATE start a kernel; any other value | ||||
|         // launches nothing. When the default stream (0) is used the call blocks until the device is idle. | ||||
|         void cornerHarris_gpu(int block_size, float k, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream) | ||||
|         { | ||||
|             const dim3 block(32, 8); | ||||
|             const dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y)); | ||||
|  | ||||
|             bindTexture(&harrisDxTex, Dx); | ||||
|             bindTexture(&harrisDyTex, Dy); | ||||
|  | ||||
|             if (border_type == BORDER_REFLECT101) | ||||
|             { | ||||
|                 cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows)); | ||||
|             } | ||||
|             else if (border_type == BORDER_REFLECT) | ||||
|             { | ||||
|                 cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows)); | ||||
|             } | ||||
|             else if (border_type == BORDER_REPLICATE) | ||||
|             { | ||||
|                 // Replication is left to the textures' clamp addressing mode. | ||||
|                 cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst); | ||||
|             } | ||||
|  | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             const bool onDefaultStream = (stream == 0); | ||||
|             if (onDefaultStream) | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|         } | ||||
|  | ||||
|         /////////////////////////////////////////// Corner Min Eigen Val ///////////////////////////////////////////////// | ||||
|  | ||||
|         // 2D float texture references read by the cornerMinEigenVal kernels below; cornerMinEigenVal_gpu | ||||
|         // binds its Dx and Dy arguments to them before launching. Constructor arguments: 0 = unnormalized | ||||
|         // coordinates, point filtering, clamp addressing. | ||||
|         texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDxTex(0, cudaFilterModePoint, cudaAddressModeClamp); | ||||
|         texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDyTex(0, cudaFilterModePoint, cudaAddressModeClamp); | ||||
|  | ||||
|         // Minimum-eigenvalue response, one output pixel per thread. minEigenValDxTex / minEigenValDyTex | ||||
|         // are sampled with the raw window coordinates, so out-of-range taps are resolved by the | ||||
|         // textures' clamp addressing mode. | ||||
|         //   block_size - side length of the square summation window | ||||
|         //   dst        - output response map; also defines the valid thread range | ||||
|         __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst) | ||||
|         { | ||||
|             const int col = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int row = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             // Threads that fall outside the destination have nothing to write. | ||||
|             if (col >= dst.cols || row >= dst.rows) | ||||
|                 return; | ||||
|  | ||||
|             const int half = block_size / 2; | ||||
|             const int top = row - half; | ||||
|             const int left = col - half; | ||||
|             const int bottom = top + block_size; | ||||
|             const int right = left + block_size; | ||||
|  | ||||
|             // Window sums of dx*dx, dx*dy and dy*dy. | ||||
|             float sxx = 0.f; | ||||
|             float sxy = 0.f; | ||||
|             float syy = 0.f; | ||||
|  | ||||
|             for (int wy = top; wy < bottom; ++wy) | ||||
|             { | ||||
|                 for (int wx = left; wx < right; ++wx) | ||||
|                 { | ||||
|                     const float gx = tex2D(minEigenValDxTex, wx, wy); | ||||
|                     const float gy = tex2D(minEigenValDyTex, wx, wy); | ||||
|  | ||||
|                     sxx += gx * gx; | ||||
|                     sxy += gx * gy; | ||||
|                     syy += gy * gy; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             // Halve the two diagonal sums before forming the result. | ||||
|             sxx *= 0.5f; | ||||
|             syy *= 0.5f; | ||||
|  | ||||
|             const float diff = sxx - syy; | ||||
|             dst(row, col) = (sxx + syy) - sqrtf(diff * diff + sxy * sxy); | ||||
|         } | ||||
|  | ||||
|  | ||||
|         // Minimum-eigenvalue response, one output pixel per thread, with explicit border handling: every | ||||
|         // window coordinate is remapped through border_col.idx_row / border_row.idx_col before the | ||||
|         // textures are sampled. | ||||
|         //   block_size - side length of the square summation window | ||||
|         //   dst        - output response map; also defines the valid thread range | ||||
|         template <typename BR, typename BC> | ||||
|         __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col) | ||||
|         { | ||||
|             const int col = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int row = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             // Threads that fall outside the destination have nothing to write. | ||||
|             if (col >= dst.cols || row >= dst.rows) | ||||
|                 return; | ||||
|  | ||||
|             const int half = block_size / 2; | ||||
|             const int top = row - half; | ||||
|             const int left = col - half; | ||||
|             const int bottom = top + block_size; | ||||
|             const int right = left + block_size; | ||||
|  | ||||
|             // Window sums of dx*dx, dx*dy and dy*dy. | ||||
|             float sxx = 0.f; | ||||
|             float sxy = 0.f; | ||||
|             float syy = 0.f; | ||||
|  | ||||
|             for (int wy = top; wy < bottom; ++wy) | ||||
|             { | ||||
|                 // The remapped row is the same for the whole inner loop. | ||||
|                 const int srcRow = border_col.idx_row(wy); | ||||
|  | ||||
|                 for (int wx = left; wx < right; ++wx) | ||||
|                 { | ||||
|                     const int srcCol = border_row.idx_col(wx); | ||||
|  | ||||
|                     const float gx = tex2D(minEigenValDxTex, srcCol, srcRow); | ||||
|                     const float gy = tex2D(minEigenValDyTex, srcCol, srcRow); | ||||
|  | ||||
|                     sxx += gx * gx; | ||||
|                     sxy += gx * gy; | ||||
|                     syy += gy * gy; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             // Halve the two diagonal sums before forming the result. | ||||
|             sxx *= 0.5f; | ||||
|             syy *= 0.5f; | ||||
|  | ||||
|             const float diff = sxx - syy; | ||||
|             dst(row, col) = (sxx + syy) - sqrtf(diff * diff + sxy * sxy); | ||||
|         } | ||||
|  | ||||
|         // Host launcher for the minimum-eigenvalue kernels. Binds Dx / Dy to the minEigenVal textures, | ||||
|         // launches the kernel variant matching border_type on `stream`, and checks the launch status. | ||||
|         // Only BORDER_REFLECT101, BORDER_REFLECT and BORDER_REPLICATE start a kernel; any other value | ||||
|         // launches nothing. When the default stream (0) is used the call blocks until the device is idle. | ||||
|         void cornerMinEigenVal_gpu(int block_size, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream) | ||||
|         { | ||||
|             const dim3 block(32, 8); | ||||
|             const dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y)); | ||||
|  | ||||
|             bindTexture(&minEigenValDxTex, Dx); | ||||
|             bindTexture(&minEigenValDyTex, Dy); | ||||
|  | ||||
|             if (border_type == BORDER_REFLECT101) | ||||
|             { | ||||
|                 cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows)); | ||||
|             } | ||||
|             else if (border_type == BORDER_REFLECT) | ||||
|             { | ||||
|                 cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows)); | ||||
|             } | ||||
|             else if (border_type == BORDER_REPLICATE) | ||||
|             { | ||||
|                 // Replication is left to the textures' clamp addressing mode. | ||||
|                 cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst); | ||||
|             } | ||||
|  | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             const bool onDefaultStream = (stream == 0); | ||||
|             if (onDefaultStream) | ||||
|                 cudaSafeCall(cudaDeviceSynchronize()); | ||||
|         } | ||||
|     } | ||||
| }}} | ||||
|  | ||||
| #endif | ||||
							
								
								
									
										544
									
								
								modules/gpuimgproc/src/cuda/debayer.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										544
									
								
								modules/gpuimgproc/src/cuda/debayer.cu
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,544 @@ | ||||
| /*M/////////////////////////////////////////////////////////////////////////////////////// | ||||
| // | ||||
| //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. | ||||
| // | ||||
| //  By downloading, copying, installing or using the software you agree to this license. | ||||
| //  If you do not agree to this license, do not download, install, | ||||
| //  copy or use the software. | ||||
| // | ||||
| // | ||||
| //                           License Agreement | ||||
| //                For Open Source Computer Vision Library | ||||
| // | ||||
| // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. | ||||
| // Copyright (C) 2009, Willow Garage Inc., all rights reserved. | ||||
| // Third party copyrights are property of their respective owners. | ||||
| // | ||||
| // Redistribution and use in source and binary forms, with or without modification, | ||||
| // are permitted provided that the following conditions are met: | ||||
| // | ||||
| //   * Redistribution's of source code must retain the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer. | ||||
| // | ||||
| //   * Redistribution's in binary form must reproduce the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer in the documentation | ||||
| //     and/or other materials provided with the distribution. | ||||
| // | ||||
| //   * The name of the copyright holders may not be used to endorse or promote products | ||||
| //     derived from this software without specific prior written permission. | ||||
| // | ||||
| // This software is provided by the copyright holders and contributors "as is" and | ||||
| // any express or implied warranties, including, but not limited to, the implied | ||||
| // warranties of merchantability and fitness for a particular purpose are disclaimed. | ||||
| // In no event shall the Intel Corporation or contributors be liable for any direct, | ||||
| // indirect, incidental, special, exemplary, or consequential damages | ||||
| // (including, but not limited to, procurement of substitute goods or services; | ||||
| // loss of use, data, or profits; or business interruption) however caused | ||||
| // and on any theory of liability, whether in contract, strict liability, | ||||
| // or tort (including negligence or otherwise) arising in any way out of | ||||
| // the use of this software, even if advised of the possibility of such damage. | ||||
| // | ||||
| //M*/ | ||||
|  | ||||
| #if !defined CUDA_DISABLER | ||||
|  | ||||
| #include "opencv2/core/cuda/common.hpp" | ||||
| #include "opencv2/core/cuda/vec_traits.hpp" | ||||
| #include "opencv2/core/cuda/vec_math.hpp" | ||||
| #include "opencv2/core/cuda/limits.hpp" | ||||
| #include "opencv2/core/cuda/color.hpp" | ||||
| #include "opencv2/core/cuda/saturate_cast.hpp" | ||||
|  | ||||
| namespace cv { namespace gpu { namespace cudev | ||||
| { | ||||
|     // Demosaicing helper keyed on the source element type; only explicit specializations are defined. | ||||
|     template <typename T> struct Bayer2BGR; | ||||
|  | ||||
|     // 8-bit specialization. One apply() call reads a 3x3 neighbourhood of uchar4 words and fills | ||||
|     // res0..res3, the interpolated 3-channel values for the four pixels (x, y, z, w) of the centre word. | ||||
|     template <> struct Bayer2BGR<uchar> | ||||
|     { | ||||
|         // Results for the centre word's pixels x, y, z and w respectively. | ||||
|         uchar3 res0; | ||||
|         uchar3 res1; | ||||
|         uchar3 res2; | ||||
|         uchar3 res3; | ||||
|  | ||||
|         // src              - source image, each row reinterpreted as an array of uchar4 words | ||||
|         // s_x              - index of the centre word within its row (uchar4 units, not pixels) | ||||
|         // s_y              - centre row; rows s_y - 1 and s_y + 1 are read as well and are not | ||||
|         //                    range-checked here | ||||
|         // blue_last,       - pattern flags; each is combined with the row parity (s_y & 1) to pick | ||||
|         // start_with_green   the interpolation layout and the output channel order | ||||
|         __device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green) | ||||
|         { | ||||
|             // patch[r][c]: r = row above / centre / below, c = word to the left / centre / right. | ||||
|             // The left index is clamped to 0 and the right index to the last word, ((cols + 3) >> 2) - 1. | ||||
|             uchar4 patch[3][3]; | ||||
|             patch[0][1] = ((const uchar4*) src.ptr(s_y - 1))[s_x]; | ||||
|             patch[0][0] = ((const uchar4*) src.ptr(s_y - 1))[::max(s_x - 1, 0)]; | ||||
|             patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)]; | ||||
|  | ||||
|             patch[1][1] = ((const uchar4*) src.ptr(s_y))[s_x]; | ||||
|             patch[1][0] = ((const uchar4*) src.ptr(s_y))[::max(s_x - 1, 0)]; | ||||
|             patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)]; | ||||
|  | ||||
|             patch[2][1] = ((const uchar4*) src.ptr(s_y + 1))[s_x]; | ||||
|             patch[2][0] = ((const uchar4*) src.ptr(s_y + 1))[::max(s_x - 1, 0)]; | ||||
|             patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)]; | ||||
|  | ||||
|             // Layout A: pixels x and z supply the middle (.y) output channel directly, while | ||||
|             // pixels y and w supply one of the outer channels directly. | ||||
|             if ((s_y & 1) ^ start_with_green) | ||||
|             { | ||||
|                 // Pixel x: t0 = mean of its vertical neighbours, t1 = mean of its horizontal | ||||
|                 // neighbours (the left one lives in the previous word's w). | ||||
|                 const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1; | ||||
|                 const int t1 = (patch[1][0].w + patch[1][1].y + 1) >> 1; | ||||
|  | ||||
|                 // Pixel y: t2 = mean of its four diagonal neighbours, t3 = mean of its four | ||||
|                 // edge-adjacent neighbours. | ||||
|                 const int t2 = (patch[0][1].x + patch[0][1].z + patch[2][1].x + patch[2][1].z + 2) >> 2; | ||||
|                 const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][1].z + patch[2][1].y + 2) >> 2; | ||||
|  | ||||
|                 // Pixel z: t4 = vertical mean, t5 = horizontal mean. | ||||
|                 const int t4 = (patch[0][1].z + patch[2][1].z + 1) >> 1; | ||||
|                 const int t5 = (patch[1][1].y + patch[1][1].w + 1) >> 1; | ||||
|  | ||||
|                 // Pixel w: t6 = diagonal mean, t7 = edge-adjacent mean (the right-hand neighbours | ||||
|                 // live in the next word's x). | ||||
|                 const int t6 = (patch[0][1].z + patch[0][2].x + patch[2][1].z + patch[2][2].x + 2) >> 2; | ||||
|                 const int t7 = (patch[0][1].w + patch[1][1].z + patch[1][2].x + patch[2][1].w + 2) >> 2; | ||||
|  | ||||
|                 // The two branches below differ only in which of the .x / .z output channels | ||||
|                 // receives which colour plane. | ||||
|                 if ((s_y & 1) ^ blue_last) | ||||
|                 { | ||||
|                     res0.x = t1; | ||||
|                     res0.y = patch[1][1].x; | ||||
|                     res0.z = t0; | ||||
|  | ||||
|                     res1.x = patch[1][1].y; | ||||
|                     res1.y = t3; | ||||
|                     res1.z = t2; | ||||
|  | ||||
|                     res2.x = t5; | ||||
|                     res2.y = patch[1][1].z; | ||||
|                     res2.z = t4; | ||||
|  | ||||
|                     res3.x = patch[1][1].w; | ||||
|                     res3.y = t7; | ||||
|                     res3.z = t6; | ||||
|                 } | ||||
|                 else | ||||
|                 { | ||||
|                     res0.x = t0; | ||||
|                     res0.y = patch[1][1].x; | ||||
|                     res0.z = t1; | ||||
|  | ||||
|                     res1.x = t2; | ||||
|                     res1.y = t3; | ||||
|                     res1.z = patch[1][1].y; | ||||
|  | ||||
|                     res2.x = t4; | ||||
|                     res2.y = patch[1][1].z; | ||||
|                     res2.z = t5; | ||||
|  | ||||
|                     res3.x = t6; | ||||
|                     res3.y = t7; | ||||
|                     res3.z = patch[1][1].w; | ||||
|                 } | ||||
|             } | ||||
|             // Layout B: the roles are swapped - pixels y and w supply the middle (.y) output | ||||
|             // channel directly, while pixels x and z supply one of the outer channels directly. | ||||
|             else | ||||
|             { | ||||
|                 // Pixel x: t0 = diagonal mean (left-hand neighbours come from the previous word's w), | ||||
|                 // t1 = edge-adjacent mean. | ||||
|                 const int t0 = (patch[0][0].w + patch[0][1].y + patch[2][0].w + patch[2][1].y + 2) >> 2; | ||||
|                 const int t1 = (patch[0][1].x + patch[1][0].w + patch[1][1].y + patch[2][1].x + 2) >> 2; | ||||
|  | ||||
|                 // Pixel y: t2 = vertical mean, t3 = horizontal mean. | ||||
|                 const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1; | ||||
|                 const int t3 = (patch[1][1].x + patch[1][1].z + 1) >> 1; | ||||
|  | ||||
|                 // Pixel z: t4 = diagonal mean, t5 = edge-adjacent mean. | ||||
|                 const int t4 = (patch[0][1].y + patch[0][1].w + patch[2][1].y + patch[2][1].w + 2) >> 2; | ||||
|                 const int t5 = (patch[0][1].z + patch[1][1].y + patch[1][1].w + patch[2][1].z + 2) >> 2; | ||||
|  | ||||
|                 // Pixel w: t6 = vertical mean, t7 = horizontal mean (right neighbour is the next | ||||
|                 // word's x). | ||||
|                 const int t6 = (patch[0][1].w + patch[2][1].w + 1) >> 1; | ||||
|                 const int t7 = (patch[1][1].z + patch[1][2].x + 1) >> 1; | ||||
|  | ||||
|                 // As above, the branches differ only in the .x / .z channel assignment. | ||||
|                 if ((s_y & 1) ^ blue_last) | ||||
|                 { | ||||
|                     res0.x = patch[1][1].x; | ||||
|                     res0.y = t1; | ||||
|                     res0.z = t0; | ||||
|  | ||||
|                     res1.x = t3; | ||||
|                     res1.y = patch[1][1].y; | ||||
|                     res1.z = t2; | ||||
|  | ||||
|                     res2.x = patch[1][1].z; | ||||
|                     res2.y = t5; | ||||
|                     res2.z = t4; | ||||
|  | ||||
|                     res3.x = t7; | ||||
|                     res3.y = patch[1][1].w; | ||||
|                     res3.z = t6; | ||||
|                 } | ||||
|                 else | ||||
|                 { | ||||
|                     res0.x = t0; | ||||
|                     res0.y = t1; | ||||
|                     res0.z = patch[1][1].x; | ||||
|  | ||||
|                     res1.x = t2; | ||||
|                     res1.y = patch[1][1].y; | ||||
|                     res1.z = t3; | ||||
|  | ||||
|                     res2.x = t4; | ||||
|                     res2.y = t5; | ||||
|                     res2.z = patch[1][1].z; | ||||
|  | ||||
|                     res3.x = t6; | ||||
|                     res3.y = patch[1][1].w; | ||||
|                     res3.z = t7; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     }; | ||||
|  | ||||
|     // Converts one interpolated 3-channel pixel to the destination pixel type D. | ||||
|     // Only the specializations below are defined: uchar, uchar3 and uchar4. | ||||
|     template <typename D> __device__ __forceinline__ D toDst(const uchar3& pix); | ||||
|  | ||||
|     // Single-channel output: run the pixel through the bgr_to_gray functor. | ||||
|     template <> __device__ __forceinline__ uchar toDst<uchar>(const uchar3& pix) | ||||
|     { | ||||
|         // No `typename` here: this explicit specialization is not a template context, the nested | ||||
|         // type is non-dependent, and `typename` outside a template is ill-formed before C++11. | ||||
|         bgr_to_gray_traits<uchar>::functor_type f = bgr_to_gray_traits<uchar>::create_functor(); | ||||
|         return f(pix); | ||||
|     } | ||||
|  | ||||
|     // Three-channel output: the pixel is passed through unchanged. | ||||
|     template <> __device__ __forceinline__ uchar3 toDst<uchar3>(const uchar3& pix) | ||||
|     { | ||||
|         return pix; | ||||
|     } | ||||
|  | ||||
|     // Four-channel output: append a fourth component fixed at 255. | ||||
|     template <> __device__ __forceinline__ uchar4 toDst<uchar4>(const uchar3& pix) | ||||
|     { | ||||
|         return make_uchar4(pix.x, pix.y, pix.z, 255); | ||||
|     } | ||||
|  | ||||
|     // Demosaics an 8-bit source: each thread handles one uchar4 word of a source row, i.e. up to four | ||||
|     // consecutive output pixels of that row. | ||||
|     //   src              - source image (supplies the bounds for every check below) | ||||
|     //   dst              - destination image of pixel type D | ||||
|     //   blue_last,       - pattern flags forwarded unchanged to Bayer2BGR<uchar>::apply | ||||
|     //   start_with_green | ||||
|     template <typename D> | ||||
|     __global__ void Bayer2BGR_8u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green) | ||||
|     { | ||||
|         const int word_x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|         const int out_y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|         const int out_x = word_x << 2; | ||||
|  | ||||
|         if (out_y >= src.rows || out_x >= src.cols) | ||||
|             return; | ||||
|  | ||||
|         // The interpolation reads the rows above and below, so the first and last output rows are | ||||
|         // computed from the nearest interior source row instead. | ||||
|         const int in_y = ::min(::max(out_y, 1), src.rows - 2); | ||||
|  | ||||
|         Bayer2BGR<uchar> bayer; | ||||
|         bayer.apply(src, word_x, in_y, blue_last, start_with_green); | ||||
|  | ||||
|         // The first pixel of the word is always inside the image (checked above); the remaining | ||||
|         // three are written only if they do not run past the last column. | ||||
|         dst(out_y, out_x) = toDst<D>(bayer.res0); | ||||
|  | ||||
|         if (out_x + 1 < src.cols) | ||||
|             dst(out_y, out_x + 1) = toDst<D>(bayer.res1); | ||||
|  | ||||
|         if (out_x + 2 < src.cols) | ||||
|             dst(out_y, out_x + 2) = toDst<D>(bayer.res2); | ||||
|  | ||||
|         if (out_x + 3 < src.cols) | ||||
|             dst(out_y, out_x + 3) = toDst<D>(bayer.res3); | ||||
|     } | ||||
|  | ||||
|     // Bayer demosaicing for 16-bit input. | ||||
|     // apply() reconstructs two horizontally adjacent 3-channel pixels (res0, res1) from the | ||||
|     // ushort2 pair at index s_x of row s_y and the neighbouring pairs in rows s_y - 1 .. s_y + 1. | ||||
|     template <> struct Bayer2BGR<ushort> | ||||
|     { | ||||
|         ushort3 res0; | ||||
|         ushort3 res1; | ||||
|  | ||||
|         __device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green) | ||||
|         { | ||||
|             // Indices of the neighbouring pairs, clamped to the pairs available in a row. | ||||
|             const int x_prev = ::max(s_x - 1, 0); | ||||
|             const int x_next = ::min(s_x + 1, ((src.cols + 1) >> 1) - 1); | ||||
|  | ||||
|             const ushort2* const row_above = (const ushort2*) src.ptr(s_y - 1); | ||||
|             const ushort2* const row_here  = (const ushort2*) src.ptr(s_y); | ||||
|             const ushort2* const row_below = (const ushort2*) src.ptr(s_y + 1); | ||||
|  | ||||
|             // 3x3 neighbourhood of pairs: t = top, m = middle, b = bottom; l = left, c = centre, r = right. | ||||
|             const ushort2 tl = row_above[x_prev], tc = row_above[s_x], tr = row_above[x_next]; | ||||
|             const ushort2 ml = row_here[x_prev],  mc = row_here[s_x],  mr = row_here[x_next]; | ||||
|             const ushort2 bl = row_below[x_prev], bc = row_below[s_x], br = row_below[x_next]; | ||||
|  | ||||
|             const bool odd_row = (s_y & 1) != 0; | ||||
|  | ||||
|             // blue_last only decides whether the x and z channels are exchanged. | ||||
|             const bool swap_xz = (odd_row != blue_last); | ||||
|  | ||||
|             if (odd_row != start_with_green) | ||||
|             { | ||||
|                 // First pixel of the pair carries the middle channel directly. | ||||
|                 const int vert  = (tc.x + bc.x + 1) >> 1; | ||||
|                 const int horiz = (ml.y + mc.y + 1) >> 1; | ||||
|  | ||||
|                 const int diag  = (tc.x + tr.x + bc.x + br.x + 2) >> 2; | ||||
|                 const int cross = (tc.y + mc.x + mr.x + bc.y + 2) >> 2; | ||||
|  | ||||
|                 if (swap_xz) | ||||
|                 { | ||||
|                     res0 = make_ushort3(horiz, mc.x, vert); | ||||
|                     res1 = make_ushort3(mc.y, cross, diag); | ||||
|                 } | ||||
|                 else | ||||
|                 { | ||||
|                     res0 = make_ushort3(vert, mc.x, horiz); | ||||
|                     res1 = make_ushort3(diag, cross, mc.y); | ||||
|                 } | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 // Second pixel of the pair carries the middle channel directly. | ||||
|                 const int diag  = (tl.y + tc.y + bl.y + bc.y + 2) >> 2; | ||||
|                 const int cross = (tc.x + ml.y + mc.y + bc.x + 2) >> 2; | ||||
|  | ||||
|                 const int vert  = (tc.y + bc.y + 1) >> 1; | ||||
|                 const int horiz = (mc.x + mr.x + 1) >> 1; | ||||
|  | ||||
|                 if (swap_xz) | ||||
|                 { | ||||
|                     res0 = make_ushort3(mc.x, cross, diag); | ||||
|                     res1 = make_ushort3(horiz, mc.y, vert); | ||||
|                 } | ||||
|                 else | ||||
|                 { | ||||
|                     res0 = make_ushort3(diag, cross, mc.x); | ||||
|                     res1 = make_ushort3(vert, mc.y, horiz); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     }; | ||||
|  | ||||
|     // Converts a 3-channel 16-bit pixel to the destination pixel type D. | ||||
|     template <typename D> __device__ __forceinline__ D toDst(const ushort3& pix); | ||||
|  | ||||
|     // Single channel: reduce the pixel with the bgr_to_gray functor. | ||||
|     template <> __device__ __forceinline__ ushort toDst<ushort>(const ushort3& pix) | ||||
|     { | ||||
|         typedef bgr_to_gray_traits<ushort>::functor_type gray_functor; | ||||
|  | ||||
|         gray_functor to_gray = bgr_to_gray_traits<ushort>::create_functor(); | ||||
|         return to_gray(pix); | ||||
|     } | ||||
|  | ||||
|     // Three channels: nothing to convert. | ||||
|     template <> __device__ __forceinline__ ushort3 toDst<ushort3>(const ushort3& pix) | ||||
|     { | ||||
|         return make_ushort3(pix.x, pix.y, pix.z); | ||||
|     } | ||||
|  | ||||
|     // Four channels: the fourth channel is set to the largest ushort value. | ||||
|     template <> __device__ __forceinline__ ushort4 toDst<ushort4>(const ushort3& pix) | ||||
|     { | ||||
|         ushort4 out; | ||||
|         out.x = pix.x; | ||||
|         out.y = pix.y; | ||||
|         out.z = pix.z; | ||||
|         out.w = numeric_limits<ushort>::max(); | ||||
|         return out; | ||||
|     } | ||||
|  | ||||
|     // Demosaics a 16-bit Bayer image. Each thread produces up to two horizontally | ||||
|     // adjacent destination pixels of one row, converted to D through toDst<D>. | ||||
|     template <typename D> | ||||
|     __global__ void Bayer2BGR_16u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green) | ||||
|     { | ||||
|         const int pair = blockIdx.x * blockDim.x + threadIdx.x; // index of the 2-pixel group | ||||
|         const int row  = blockIdx.y * blockDim.y + threadIdx.y; // destination row | ||||
|         const int col  = pair << 1;                             // first destination column | ||||
|  | ||||
|         if (row >= src.rows || col >= src.cols) | ||||
|             return; | ||||
|  | ||||
|         // apply() reads rows src_row - 1 and src_row + 1, so the source row is kept in [1, rows - 2]. | ||||
|         const int src_row = ::min(::max(row, 1), src.rows - 2); | ||||
|  | ||||
|         Bayer2BGR<ushort> bayer; | ||||
|         bayer.apply(src, pair, src_row, blue_last, start_with_green); | ||||
|  | ||||
|         // The second pixel of the pair may fall past the last column. | ||||
|         dst(row, col) = toDst<D>(bayer.res0); | ||||
|         if (col + 1 < src.cols) | ||||
|             dst(row, col + 1) = toDst<D>(bayer.res1); | ||||
|     } | ||||
|  | ||||
|     // Host launcher for Bayer2BGR_8u producing cn-channel 8-bit output. | ||||
|     // The kernel is enqueued on `stream`; for the default stream (0) the call also | ||||
|     // waits for the device to finish. | ||||
|     template <int cn> | ||||
|     void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream) | ||||
|     { | ||||
|         typedef typename TypeVec<uchar, cn>::vec_type pixel_t; | ||||
|  | ||||
|         // Every thread covers four columns, hence the factor 4 in the grid width. | ||||
|         const dim3 threads(32, 8); | ||||
|         const dim3 blocks(divUp(src.cols, 4 * threads.x), divUp(src.rows, threads.y)); | ||||
|  | ||||
|         cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<pixel_t>, cudaFuncCachePreferL1) ); | ||||
|  | ||||
|         Bayer2BGR_8u<pixel_t><<<blocks, threads, 0, stream>>>(src, static_cast<PtrStepSz<pixel_t> >(dst), blue_last, start_with_green); | ||||
|         cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|         if (stream == 0) | ||||
|             cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|     } | ||||
|  | ||||
|     // Host launcher for Bayer2BGR_16u producing cn-channel 16-bit output. | ||||
|     // The kernel is enqueued on `stream`; for the default stream (0) the call also | ||||
|     // waits for the device to finish. | ||||
|     template <int cn> | ||||
|     void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream) | ||||
|     { | ||||
|         typedef typename TypeVec<ushort, cn>::vec_type pixel_t; | ||||
|  | ||||
|         // Every thread covers two columns, hence the factor 2 in the grid width. | ||||
|         const dim3 threads(32, 8); | ||||
|         const dim3 blocks(divUp(src.cols, 2 * threads.x), divUp(src.rows, threads.y)); | ||||
|  | ||||
|         cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<pixel_t>, cudaFuncCachePreferL1) ); | ||||
|  | ||||
|         Bayer2BGR_16u<pixel_t><<<blocks, threads, 0, stream>>>(src, static_cast<PtrStepSz<pixel_t> >(dst), blue_last, start_with_green); | ||||
|         cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|         if (stream == 0) | ||||
|             cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|     } | ||||
|  | ||||
|     template void Bayer2BGR_8u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); /* explicit instantiations of the 8-bit launcher for 1-, 3- and 4-channel output */ | ||||
|     template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); | ||||
|     template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); | ||||
|  | ||||
|     template void Bayer2BGR_16u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); /* explicit instantiations of the 16-bit launcher for 1-, 3- and 4-channel output */ | ||||
|     template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); | ||||
|     template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); | ||||
|  | ||||
|     ////////////////////////////////////////////////////////////// | ||||
|     // Bayer Demosaicing (Malvar, He, and Cutler) | ||||
|     // | ||||
|     // by Morgan McGuire, Williams College | ||||
|     // http://graphics.cs.williams.edu/papers/BayerJGT09/#shaders | ||||
|     // | ||||
|     // ported to CUDA | ||||
|  | ||||
|     // Source Bayer image: unnormalized coordinates, point sampling, clamped addressing. | ||||
|     texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp); | ||||
|  | ||||
|     // Malvar-He-Cutler demosaicing kernel; one thread per destination pixel. | ||||
|     // | ||||
|     //   dst          - output image; its first/last row and column are NOT written. | ||||
|     //   sourceOffset - added to (x, y) to obtain the sampling position in sourceTex. | ||||
|     //   firstRed     - added to (x, y) before taking the parity that selects which of | ||||
|     //                  the four pixel types of the Bayer pattern the thread is on. | ||||
|     // | ||||
|     // The source is read through sourceTex, which the host must bind before the launch. | ||||
|     template <typename DstType> | ||||
|     __global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed) | ||||
|     { | ||||
|         const float   kAx = -1.0f / 8.0f,     kAy = -1.5f / 8.0f,     kAz =  0.5f / 8.0f    /*kAw = -1.0f / 8.0f*/; | ||||
|         const float   kBx =  2.0f / 8.0f,   /*kBy =  0.0f / 8.0f,*/ /*kBz =  0.0f / 8.0f,*/   kBw =  4.0f / 8.0f  ; | ||||
|         const float   kCx =  4.0f / 8.0f,     kCy =  6.0f / 8.0f,     kCz =  5.0f / 8.0f    /*kCw =  5.0f / 8.0f*/; | ||||
|         const float /*kDx =  0.0f / 8.0f,*/   kDy =  2.0f / 8.0f,     kDz = -1.0f / 8.0f    /*kDw = -1.0f / 8.0f*/; | ||||
|         const float   kEx = -1.0f / 8.0f,     kEy = -1.5f / 8.0f,   /*kEz = -1.0f / 8.0f,*/   kEw =  0.5f / 8.0f  ; | ||||
|         const float   kFx =  2.0f / 8.0f,   /*kFy =  0.0f / 8.0f,*/   kFz =  4.0f / 8.0f    /*kFw =  0.0f / 8.0f*/; | ||||
|  | ||||
|         const int x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|         const int y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|         // Skip the one-pixel border of dst and everything outside of it. | ||||
|         if (x == 0 || x >= dst.cols - 1 || y == 0 || y >= dst.rows - 1) | ||||
|             return; | ||||
|  | ||||
|         int2 center; | ||||
|         center.x = x + sourceOffset.x; | ||||
|         center.y = y + sourceOffset.y; | ||||
|  | ||||
|         int4 xCoord; | ||||
|         xCoord.x = center.x - 2; | ||||
|         xCoord.y = center.x - 1; | ||||
|         xCoord.z = center.x + 1; | ||||
|         xCoord.w = center.x + 2; | ||||
|  | ||||
|         int4 yCoord; | ||||
|         yCoord.x = center.y - 2; | ||||
|         yCoord.y = center.y - 1; | ||||
|         yCoord.z = center.y + 1; | ||||
|         yCoord.w = center.y + 2; | ||||
|  | ||||
|         float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0) | ||||
|  | ||||
|         float4 Dvec; | ||||
|         Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1) | ||||
|         Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1) | ||||
|         Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1) | ||||
|         Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1) | ||||
|  | ||||
|         float4 value; | ||||
|         value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0 | ||||
|         value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0 | ||||
|         value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0 | ||||
|         value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0 | ||||
|  | ||||
|         // (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1) | ||||
|         value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1 | ||||
|         value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1 | ||||
|         value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1 | ||||
|         value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1 | ||||
|  | ||||
|         float4 PATTERN; | ||||
|         PATTERN.x = kCx * C; | ||||
|         PATTERN.y = kCy * C; | ||||
|         PATTERN.z = kCz * C; | ||||
|         PATTERN.w = PATTERN.z; | ||||
|  | ||||
|         float D = Dvec.x + Dvec.y + Dvec.z + Dvec.w; | ||||
|  | ||||
|         // There are five filter patterns (identity, cross, checker, | ||||
|         // theta, phi). Precompute the terms from all of them and then | ||||
|         // use swizzles to assign to color channels. | ||||
|         // | ||||
|         // Channel Matches | ||||
|         // x cross (e.g., EE G) | ||||
|         // y checker (e.g., EE B) | ||||
|         // z theta (e.g., EO R) | ||||
|         // w phi (e.g., EO B) | ||||
|  | ||||
|         // Scoped names for the pair sums. These used to be #define'd macros that were never | ||||
|         // #undef'd and therefore leaked single-letter macros into the rest of the file. | ||||
|         const float A = value.x;  // A0 + A1 | ||||
|         const float B = value.y;  // B0 + B1 | ||||
|         const float E = value.z;  // E0 + E1 | ||||
|         const float F = value.w;  // F0 + F1 | ||||
|  | ||||
|         float3 temp; | ||||
|  | ||||
|         // PATTERN.yzw += (kD.yz * D).xyy; | ||||
|         temp.x = kDy * D; | ||||
|         temp.y = kDz * D; | ||||
|         PATTERN.y += temp.x; | ||||
|         PATTERN.z += temp.y; | ||||
|         PATTERN.w += temp.y; | ||||
|  | ||||
|         // PATTERN += (kA.xyz * A).xyzx; | ||||
|         temp.x = kAx * A; | ||||
|         temp.y = kAy * A; | ||||
|         temp.z = kAz * A; | ||||
|         PATTERN.x += temp.x; | ||||
|         PATTERN.y += temp.y; | ||||
|         PATTERN.z += temp.z; | ||||
|         PATTERN.w += temp.x; | ||||
|  | ||||
|         // PATTERN += (kE.xyw * E).xyxz; | ||||
|         temp.x = kEx * E; | ||||
|         temp.y = kEy * E; | ||||
|         temp.z = kEw * E; | ||||
|         PATTERN.x += temp.x; | ||||
|         PATTERN.y += temp.y; | ||||
|         PATTERN.z += temp.x; | ||||
|         PATTERN.w += temp.z; | ||||
|  | ||||
|         // PATTERN.xw += kB.xw * B; | ||||
|         PATTERN.x += kBx * B; | ||||
|         PATTERN.w += kBw * B; | ||||
|  | ||||
|         // PATTERN.xz += kF.xz * F; | ||||
|         PATTERN.x += kFx * F; | ||||
|         PATTERN.z += kFz * F; | ||||
|  | ||||
|         // Determine which of four types of pixels we are on. | ||||
|         int2 alternate; | ||||
|         alternate.x = (x + firstRed.x) % 2; | ||||
|         alternate.y = (y + firstRed.y) % 2; | ||||
|  | ||||
|         // in BGR sequence; | ||||
|         uchar3 pixelColor = | ||||
|             (alternate.y == 0) ? | ||||
|                 ((alternate.x == 0) ? | ||||
|                     make_uchar3(saturate_cast<uchar>(PATTERN.y), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(C)) : | ||||
|                     make_uchar3(saturate_cast<uchar>(PATTERN.w), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.z))) : | ||||
|                 ((alternate.x == 0) ? | ||||
|                     make_uchar3(saturate_cast<uchar>(PATTERN.z), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.w)) : | ||||
|                     make_uchar3(saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(PATTERN.y))); | ||||
|  | ||||
|         dst(y, x) = toDst<DstType>(pixelColor); | ||||
|     } | ||||
|  | ||||
|     // Host launcher for the MHCdemosaic kernel with cn-channel 8-bit output. | ||||
|     // Binds `src` to sourceTex, enqueues the kernel on `stream` and, for the default | ||||
|     // stream (0), waits for the device to finish. | ||||
|     template <int cn> | ||||
|     void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream) | ||||
|     { | ||||
|         typedef typename TypeVec<uchar, cn>::vec_type pixel_t; | ||||
|  | ||||
|         // NOTE(review): the grid is sized from src while the kernel bounds-checks against dst - | ||||
|         // confirm that callers always pass images of equal size. | ||||
|         const dim3 threads(32, 8); | ||||
|         const dim3 blocks(divUp(src.cols, threads.x), divUp(src.rows, threads.y)); | ||||
|  | ||||
|         bindTexture(&sourceTex, src); | ||||
|  | ||||
|         MHCdemosaic<pixel_t><<<blocks, threads, 0, stream>>>(static_cast<PtrStepSz<pixel_t> >(dst), sourceOffset, firstRed); | ||||
|         cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|         if (stream == 0) | ||||
|             cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|     } | ||||
|  | ||||
|     template void MHCdemosaic<1>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream); /* explicit instantiations of the host launcher for 1-, 3- and 4-channel output */ | ||||
|     template void MHCdemosaic<3>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream); | ||||
|     template void MHCdemosaic<4>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream); | ||||
| }}} | ||||
|  | ||||
| #endif /* CUDA_DISABLER */ | ||||
							
								
								
									
										143
									
								
								modules/gpuimgproc/src/cuda/gftt.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										143
									
								
								modules/gpuimgproc/src/cuda/gftt.cu
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,143 @@ | ||||
| /*M/////////////////////////////////////////////////////////////////////////////////////// | ||||
| // | ||||
| //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. | ||||
| // | ||||
| //  By downloading, copying, installing or using the software you agree to this license. | ||||
| //  If you do not agree to this license, do not download, install, | ||||
| //  copy or use the software. | ||||
| // | ||||
| // | ||||
| //                           License Agreement | ||||
| //                For Open Source Computer Vision Library | ||||
| // | ||||
| // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. | ||||
| // Copyright (C) 2009, Willow Garage Inc., all rights reserved. | ||||
| // Third party copyrights are property of their respective owners. | ||||
| // | ||||
| // Redistribution and use in source and binary forms, with or without modification, | ||||
| // are permitted provided that the following conditions are met: | ||||
| // | ||||
| //   * Redistribution's of source code must retain the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer. | ||||
| // | ||||
| //   * Redistribution's in binary form must reproduce the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer in the documentation | ||||
| //     and/or other materials provided with the distribution. | ||||
| // | ||||
| //   * The name of the copyright holders may not be used to endorse or promote products | ||||
| //     derived from this software without specific prior written permission. | ||||
| // | ||||
| // This software is provided by the copyright holders and contributors "as is" and | ||||
| // any express or implied warranties, including, but not limited to, the implied | ||||
| // warranties of merchantability and fitness for a particular purpose are disclaimed. | ||||
| // In no event shall the Intel Corporation or contributors be liable for any direct, | ||||
| // indirect, incidental, special, exemplary, or consequential damages | ||||
| // (including, but not limited to, procurement of substitute goods or services; | ||||
| // loss of use, data, or profits; or business interruption) however caused | ||||
| // and on any theory of liability, whether in contract, strict liability, | ||||
| // or tort (including negligence or otherwise) arising in any way out of | ||||
| // the use of this software, even if advised of the possibility of such damage. | ||||
| // | ||||
| //M*/ | ||||
|  | ||||
| #if !defined CUDA_DISABLER | ||||
|  | ||||
| #include <thrust/device_ptr.h> | ||||
| #include <thrust/sort.h> | ||||
|  | ||||
| #include "opencv2/core/cuda/common.hpp" | ||||
| #include "opencv2/core/cuda/utility.hpp" | ||||
|  | ||||
| namespace cv { namespace gpu { namespace cudev | ||||
| { | ||||
|     namespace gfft | ||||
|     { | ||||
|         // Eigenvalue map: unnormalized coordinates, point sampling, clamped addressing. | ||||
|         texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp); | ||||
|  | ||||
|         // Number of corners accepted by findCorners; it keeps counting past max_count. | ||||
|         __device__ int g_counter = 0; | ||||
|  | ||||
|         // Appends to `corners` every interior, unmasked pixel whose value in eigTex exceeds | ||||
|         // `threshold` and is not smaller than any of its eight neighbours. One thread per pixel. | ||||
|         // At most max_count entries are stored; g_counter is incremented for every hit. | ||||
|         template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols) | ||||
|         { | ||||
|             const int col = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int row = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             // The mask is only queried for pixels strictly inside the image. | ||||
|             const bool interior = row > 0 && row < rows - 1 && col > 0 && col < cols - 1; | ||||
|             if (!interior || !mask(row, col)) | ||||
|                 return; | ||||
|  | ||||
|             const float val = tex2D(eigTex, col, row); | ||||
|             if (!(val > threshold)) | ||||
|                 return; | ||||
|  | ||||
|             // Largest value of the 3x3 neighbourhood, visited row by row. | ||||
|             float maxVal = val; | ||||
|             for (int dy = -1; dy <= 1; ++dy) | ||||
|             { | ||||
|                 for (int dx = -1; dx <= 1; ++dx) | ||||
|                 { | ||||
|                     if (dx != 0 || dy != 0) | ||||
|                         maxVal = ::fmax(tex2D(eigTex, col + dx, row + dy), maxVal); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             if (val != maxVal) | ||||
|                 return; | ||||
|  | ||||
|             // Reserve an output slot; slots beyond the buffer capacity are dropped. | ||||
|             const int slot = ::atomicAdd(&g_counter, 1); | ||||
|             if (slot < max_count) | ||||
|                 corners[slot] = make_float2(col, row); | ||||
|         } | ||||
|  | ||||
|         // Runs findCorners over `eig` and returns the number of corners stored in `corners` | ||||
|         // (never more than max_count). A mask is applied only when mask.data is non-null. | ||||
|         // The call is synchronous: it waits for the kernel before reading the counter back. | ||||
|         int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count) | ||||
|         { | ||||
|             // Reset the device-side corner counter. | ||||
|             void* d_counter; | ||||
|             cudaSafeCall( cudaGetSymbolAddress(&d_counter, g_counter) ); | ||||
|             cudaSafeCall( cudaMemset(d_counter, 0, sizeof(int)) ); | ||||
|  | ||||
|             bindTexture(&eigTex, eig); | ||||
|  | ||||
|             const dim3 threads(16, 16); | ||||
|             const dim3 blocks(divUp(eig.cols, threads.x), divUp(eig.rows, threads.y)); | ||||
|  | ||||
|             if (!mask.data) | ||||
|                 findCorners<<<blocks, threads>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols); | ||||
|             else | ||||
|                 findCorners<<<blocks, threads>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols); | ||||
|  | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|             cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|  | ||||
|             int found; | ||||
|             cudaSafeCall( cudaMemcpy(&found, d_counter, sizeof(int), cudaMemcpyDeviceToHost) ); | ||||
|  | ||||
|             // The counter may exceed the buffer capacity; only max_count entries were written. | ||||
|             return max_count < found ? max_count : found; | ||||
|         } | ||||
|  | ||||
|         // Ordering predicate for corners: a precedes b when the eigTex value sampled at a | ||||
|         // is strictly greater than the one sampled at b. | ||||
|         struct EigGreater | ||||
|         { | ||||
|             __device__ __forceinline__ bool operator()(float2 a, float2 b) const | ||||
|             { | ||||
|                 const float eig_a = tex2D(eigTex, a.x, a.y); | ||||
|                 const float eig_b = tex2D(eigTex, b.x, b.y); | ||||
|                 return eig_a > eig_b; | ||||
|             } | ||||
|         }; | ||||
|  | ||||
|  | ||||
|         // Sorts the first `count` entries of the device array `corners` in place with | ||||
|         // EigGreater, i.e. by decreasing value of `eig` at the corner position. | ||||
|         void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count) | ||||
|         { | ||||
|             // EigGreater samples eigTex, so the map has to be bound before sorting. | ||||
|             bindTexture(&eigTex, eig); | ||||
|  | ||||
|             const thrust::device_ptr<float2> first(corners); | ||||
|             const thrust::device_ptr<float2> last = first + count; | ||||
|  | ||||
|             thrust::sort(first, last, EigGreater()); | ||||
|         } | ||||
|     } // namespace gfft | ||||
| }}} | ||||
|  | ||||
|  | ||||
| #endif /* CUDA_DISABLER */ | ||||
							
								
								
									
										153
									
								
								modules/gpuimgproc/src/cuda/hist.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										153
									
								
								modules/gpuimgproc/src/cuda/hist.cu
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,153 @@ | ||||
| /*M/////////////////////////////////////////////////////////////////////////////////////// | ||||
| // | ||||
| //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. | ||||
| // | ||||
| //  By downloading, copying, installing or using the software you agree to this license. | ||||
| //  If you do not agree to this license, do not download, install, | ||||
| //  copy or use the software. | ||||
| // | ||||
| // | ||||
| //                           License Agreement | ||||
| //                For Open Source Computer Vision Library | ||||
| // | ||||
| // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. | ||||
| // Copyright (C) 2009, Willow Garage Inc., all rights reserved. | ||||
| // Third party copyrights are property of their respective owners. | ||||
| // | ||||
| // Redistribution and use in source and binary forms, with or without modification, | ||||
| // are permitted provided that the following conditions are met: | ||||
| // | ||||
| //   * Redistribution's of source code must retain the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer. | ||||
| // | ||||
| //   * Redistribution's in binary form must reproduce the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer in the documentation | ||||
| //     and/or other materials provided with the distribution. | ||||
| // | ||||
| //   * The name of the copyright holders may not be used to endorse or promote products | ||||
| //     derived from this software without specific prior written permission. | ||||
| // | ||||
| // This software is provided by the copyright holders and contributors "as is" and | ||||
| // any express or implied warranties, including, but not limited to, the implied | ||||
| // warranties of merchantability and fitness for a particular purpose are disclaimed. | ||||
| // In no event shall the Intel Corporation or contributors be liable for any direct, | ||||
| // indirect, incidental, special, exemplary, or consequential damages | ||||
| // (including, but not limited to, procurement of substitute goods or services; | ||||
| // loss of use, data, or profits; or business interruption) however caused | ||||
| // and on any theory of liability, whether in contract, strict liability, | ||||
| // or tort (including negligence or otherwise) arising in any way out of | ||||
| // the use of this software, even if advised of the possibility of such damage. | ||||
| // | ||||
| //M*/ | ||||
|  | ||||
| #if !defined CUDA_DISABLER | ||||
|  | ||||
| #include "opencv2/core/cuda/common.hpp" | ||||
| #include "opencv2/core/cuda/functional.hpp" | ||||
| #include "opencv2/core/cuda/emulation.hpp" | ||||
| #include "opencv2/core/cuda/transform.hpp" | ||||
|  | ||||
| using namespace cv::gpu; | ||||
| using namespace cv::gpu::cudev; | ||||
|  | ||||
| namespace hist | ||||
| { | ||||
|     // Accumulates a 256-bin histogram of an 8-bit single-channel image into `hist`. | ||||
|     // | ||||
|     //   src, step  - first byte of the image and distance in bytes between rows | ||||
|     //   cols, rows - image size in pixels | ||||
|     //   hist       - 256 global counters; the kernel adds to them, it does not clear them | ||||
|     // | ||||
|     // Launch layout: 1D grid; every block row (threadIdx.y) handles one image row, the threads | ||||
|     // of that block row stride over its columns. The block must have exactly 256 threads, | ||||
|     // because each thread clears and flushes exactly one bin of the shared histogram. | ||||
|     __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist) | ||||
|     { | ||||
|         __shared__ int shist[256]; | ||||
|  | ||||
|         const int y = blockIdx.x * blockDim.y + threadIdx.y; | ||||
|         const int tid = threadIdx.y * blockDim.x + threadIdx.x; | ||||
|  | ||||
|         shist[tid] = 0; | ||||
|         __syncthreads(); | ||||
|  | ||||
|         if (y < rows) | ||||
|         { | ||||
|             const uchar* rowBytes = src + y * step; | ||||
|  | ||||
|             // 32-bit loads require a 4-byte aligned address. A row may start unaligned | ||||
|             // (e.g. a sub-image with an odd column offset), so the leading bytes up to the | ||||
|             // next aligned address are handled byte-wise. For aligned rows head is 0. | ||||
|             const int misalign = static_cast<int>(reinterpret_cast<size_t>(rowBytes) & 3); | ||||
|             const int head = ::min(cols, (4 - misalign) & 3); | ||||
|  | ||||
|             const unsigned int* rowPtr = (const unsigned int*) (rowBytes + head); | ||||
|  | ||||
|             // Bulk of the row: four pixels per 32-bit load. | ||||
|             const int cols_4 = (cols - head) / 4; | ||||
|             for (int x = threadIdx.x; x < cols_4; x += blockDim.x) | ||||
|             { | ||||
|                 unsigned int data = rowPtr[x]; | ||||
|  | ||||
|                 Emulation::smem::atomicAdd(&shist[(data >>  0) & 0xFFU], 1); | ||||
|                 Emulation::smem::atomicAdd(&shist[(data >>  8) & 0xFFU], 1); | ||||
|                 Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1); | ||||
|                 Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1); | ||||
|             } | ||||
|  | ||||
|             // Leading and trailing pixels not covered by whole words: one thread per row. | ||||
|             if (threadIdx.x == 0) | ||||
|             { | ||||
|                 for (int x = 0; x < head; ++x) | ||||
|                 { | ||||
|                     unsigned int data = rowBytes[x]; | ||||
|                     Emulation::smem::atomicAdd(&shist[data], 1); | ||||
|                 } | ||||
|  | ||||
|                 for (int x = head + cols_4 * 4; x < cols; ++x) | ||||
|                 { | ||||
|                     unsigned int data = rowBytes[x]; | ||||
|                     Emulation::smem::atomicAdd(&shist[data], 1); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // All rows of the block must be counted before the shared bins are flushed. | ||||
|         __syncthreads(); | ||||
|  | ||||
|         const int histVal = shist[tid]; | ||||
|         if (histVal > 0) | ||||
|             ::atomicAdd(hist + tid, histVal); | ||||
|     } | ||||
|  | ||||
|     // Enqueues histogram256Kernel for `src` on `stream`, adding the counts to the 256 | ||||
|     // counters at `hist`. For the default stream (0) the call waits for the device to finish. | ||||
|     void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream) | ||||
|     { | ||||
|         // 32 x 8 = 256 threads per block; each block covers 8 image rows. | ||||
|         const dim3 threads(32, 8); | ||||
|         const dim3 blocks(divUp(src.rows, threads.y)); | ||||
|  | ||||
|         histogram256Kernel<<<blocks, threads, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist); | ||||
|         cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|         if (stream == 0) | ||||
|             cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|     } | ||||
| } | ||||
|  | ||||
| ///////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
| namespace hist | ||||
| { | ||||
|     // Lookup table read by EqualizeHist; filled by the host before the transform runs. | ||||
|     __constant__ int c_lut[256]; | ||||
|  | ||||
|     // Per-pixel functor: maps an 8-bit value to c_lut[value] * scale, rounded to nearest. | ||||
|     struct EqualizeHist : unary_function<uchar, uchar> | ||||
|     { | ||||
|         float scale; | ||||
|  | ||||
|         __host__ EqualizeHist(float scale_) : scale(scale_) {} | ||||
|  | ||||
|         __device__ __forceinline__ uchar operator ()(uchar val) const | ||||
|         { | ||||
|             return __float2int_rn(scale * c_lut[val]); | ||||
|         } | ||||
|     }; | ||||
| } | ||||
|  | ||||
| namespace cv { namespace gpu { namespace cudev | ||||
| { | ||||
|     template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist> /* transform tuning traits for EqualizeHist; everything except smart_shift is inherited from the defaults */ | ||||
|     { | ||||
|         enum { smart_shift = 4 }; /* NOTE(review): presumably the number of elements each thread processes in cudev::transform - confirm in transform.hpp */ | ||||
|     }; | ||||
| }}} | ||||
|  | ||||
| namespace hist | ||||
| { | ||||
|     // Applies histogram equalization to `src`, writing the result to `dst`. | ||||
|     // `lut` is a device pointer to 256 ints; it is copied into c_lut, and each pixel | ||||
|     // becomes round(lut[pixel] * 255 / (cols * rows)). | ||||
|     void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream) | ||||
|     { | ||||
|         const size_t lutBytes = 256 * sizeof(int); | ||||
|  | ||||
|         // Upload the table: asynchronously on a user stream, blocking on the default stream. | ||||
|         if (stream != 0) | ||||
|             cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, lutBytes, 0, cudaMemcpyDeviceToDevice, stream) ); | ||||
|         else | ||||
|             cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, lutBytes, 0, cudaMemcpyDeviceToDevice) ); | ||||
|  | ||||
|         const int pixelCount = src.cols * src.rows; | ||||
|         const float scale = 255.0f / pixelCount; | ||||
|  | ||||
|         cudev::transform(src, dst, EqualizeHist(scale), WithOutMask(), stream); | ||||
|     } | ||||
| } | ||||
|  | ||||
| #endif /* CUDA_DISABLER */ | ||||
							
								
								
									
										1709
									
								
								modules/gpuimgproc/src/cuda/hough.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1709
									
								
								modules/gpuimgproc/src/cuda/hough.cu
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										916
									
								
								modules/gpuimgproc/src/cuda/match_template.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										916
									
								
								modules/gpuimgproc/src/cuda/match_template.cu
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,916 @@ | ||||
| /*M/////////////////////////////////////////////////////////////////////////////////////// | ||||
| // | ||||
| //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. | ||||
| // | ||||
| //  By downloading, copying, installing or using the software you agree to this license. | ||||
| //  If you do not agree to this license, do not download, install, | ||||
| //  copy or use the software. | ||||
| // | ||||
| // | ||||
| //                           License Agreement | ||||
| //                For Open Source Computer Vision Library | ||||
| // | ||||
| // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. | ||||
| // Copyright (C) 2009, Willow Garage Inc., all rights reserved. | ||||
| // Third party copyrights are property of their respective owners. | ||||
| // | ||||
| // Redistribution and use in source and binary forms, with or without modification, | ||||
| // are permitted provided that the following conditions are met: | ||||
| // | ||||
| //   * Redistribution's of source code must retain the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer. | ||||
| // | ||||
| //   * Redistribution's in binary form must reproduce the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer in the documentation | ||||
| //     and/or other materials provided with the distribution. | ||||
| // | ||||
| //   * The name of the copyright holders may not be used to endorse or promote products | ||||
| //     derived from this software without specific prior written permission. | ||||
| // | ||||
| // This software is provided by the copyright holders and contributors "as is" and | ||||
| // any express or implied warranties, including, but not limited to, the implied | ||||
| // warranties of merchantability and fitness for a particular purpose are disclaimed. | ||||
| // In no event shall the Intel Corporation or contributors be liable for any direct, | ||||
| // indirect, incidental, special, exemplary, or consequential damages | ||||
| // (including, but not limited to, procurement of substitute goods or services; | ||||
| // loss of use, data, or profits; or business interruption) however caused | ||||
| // and on any theory of liability, whether in contract, strict liability, | ||||
| // or tort (including negligence or otherwise) arising in any way out of | ||||
| // the use of this software, even if advised of the possibility of such damage. | ||||
| // | ||||
| //M*/ | ||||
|  | ||||
| #if !defined CUDA_DISABLER | ||||
|  | ||||
| #include "opencv2/core/cuda/common.hpp" | ||||
| #include "opencv2/core/cuda/vec_math.hpp" | ||||
|  | ||||
| namespace cv { namespace gpu { namespace cudev | ||||
| { | ||||
|     namespace match_template | ||||
|     { | ||||
|         // sum(v): adds up the components of a 1- to 4-component float value, left to right. | ||||
|         __device__ __forceinline__ float sum(float v) | ||||
|         { | ||||
|             return v; | ||||
|         } | ||||
|         __device__ __forceinline__ float sum(float2 v) | ||||
|         { | ||||
|             return v.x + v.y; | ||||
|         } | ||||
|         __device__ __forceinline__ float sum(float3 v) | ||||
|         { | ||||
|             const float xy = v.x + v.y; | ||||
|             return xy + v.z; | ||||
|         } | ||||
|         __device__ __forceinline__ float sum(float4 v) | ||||
|         { | ||||
|             const float xy = v.x + v.y; | ||||
|             const float xyz = xy + v.z; | ||||
|             return xyz + v.w; | ||||
|         } | ||||
|  | ||||
|         // first(v): returns the leading (x) component of a 1- to 4-component float value. | ||||
|         __device__ __forceinline__ float first(float v) | ||||
|         { | ||||
|             return v; | ||||
|         } | ||||
|         __device__ __forceinline__ float first(float2 v) | ||||
|         { | ||||
|             return v.x; | ||||
|         } | ||||
|         __device__ __forceinline__ float first(float3 v) | ||||
|         { | ||||
|             return v.x; | ||||
|         } | ||||
|         __device__ __forceinline__ float first(float4 v) | ||||
|         { | ||||
|             return v.x; | ||||
|         } | ||||
|  | ||||
|         // mul(a, b): component-wise product of two float values; the result is a float vector | ||||
|         // with the same number of components as the inputs. | ||||
|         __device__ __forceinline__ float mul(float a, float b) | ||||
|         { | ||||
|             return a * b; | ||||
|         } | ||||
|         __device__ __forceinline__ float2 mul(float2 a, float2 b) | ||||
|         { | ||||
|             return make_float2(a.x * b.x, | ||||
|                                a.y * b.y); | ||||
|         } | ||||
|         __device__ __forceinline__ float3 mul(float3 a, float3 b) | ||||
|         { | ||||
|             return make_float3(a.x * b.x, | ||||
|                                a.y * b.y, | ||||
|                                a.z * b.z); | ||||
|         } | ||||
|         __device__ __forceinline__ float4 mul(float4 a, float4 b) | ||||
|         { | ||||
|             return make_float4(a.x * b.x, | ||||
|                                a.y * b.y, | ||||
|                                a.z * b.z, | ||||
|                                a.w * b.w); | ||||
|         } | ||||
|  | ||||
|         // mul(a, b) for uchar inputs: each product is formed on the uchar operands and then | ||||
|         // stored into a float component. | ||||
|         __device__ __forceinline__ float mul(uchar a, uchar b) | ||||
|         { | ||||
|             return a * b; | ||||
|         } | ||||
|         __device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) | ||||
|         { | ||||
|             return make_float2(a.x * b.x, | ||||
|                                a.y * b.y); | ||||
|         } | ||||
|         __device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) | ||||
|         { | ||||
|             return make_float3(a.x * b.x, | ||||
|                                a.y * b.y, | ||||
|                                a.z * b.z); | ||||
|         } | ||||
|         __device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) | ||||
|         { | ||||
|             return make_float4(a.x * b.x, | ||||
|                                a.y * b.y, | ||||
|                                a.z * b.z, | ||||
|                                a.w * b.w); | ||||
|         } | ||||
|  | ||||
|         // sub(a, b): component-wise difference a - b of two float values; the result is a float | ||||
|         // vector with the same number of components as the inputs. | ||||
|         __device__ __forceinline__ float sub(float a, float b) | ||||
|         { | ||||
|             return a - b; | ||||
|         } | ||||
|         __device__ __forceinline__ float2 sub(float2 a, float2 b) | ||||
|         { | ||||
|             return make_float2(a.x - b.x, | ||||
|                                a.y - b.y); | ||||
|         } | ||||
|         __device__ __forceinline__ float3 sub(float3 a, float3 b) | ||||
|         { | ||||
|             return make_float3(a.x - b.x, | ||||
|                                a.y - b.y, | ||||
|                                a.z - b.z); | ||||
|         } | ||||
|         __device__ __forceinline__ float4 sub(float4 a, float4 b) | ||||
|         { | ||||
|             return make_float4(a.x - b.x, | ||||
|                                a.y - b.y, | ||||
|                                a.z - b.z, | ||||
|                                a.w - b.w); | ||||
|         } | ||||
|  | ||||
|         // sub(a, b) for uchar inputs: each difference is formed on the uchar operands and then | ||||
|         // stored into a float component. | ||||
|         __device__ __forceinline__ float sub(uchar a, uchar b) | ||||
|         { | ||||
|             return a - b; | ||||
|         } | ||||
|         __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) | ||||
|         { | ||||
|             return make_float2(a.x - b.x, | ||||
|                                a.y - b.y); | ||||
|         } | ||||
|         __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) | ||||
|         { | ||||
|             return make_float3(a.x - b.x, | ||||
|                                a.y - b.y, | ||||
|                                a.z - b.z); | ||||
|         } | ||||
|         __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) | ||||
|         { | ||||
|             return make_float4(a.x - b.x, | ||||
|                                a.y - b.y, | ||||
|                                a.z - b.z, | ||||
|                                a.w - b.w); | ||||
|         } | ||||
|  | ||||
|         ////////////////////////////////////////////////////////////////////// | ||||
|         // Naive_CCORR | ||||
|  | ||||
|         // Brute-force cross-correlation: each thread handles one result element (x, y). | ||||
|         // It walks the h x w window, accumulating per channel the products of | ||||
|         // image(y + dy, x + dx) and templ(dy, dx), and stores the sum over channels in result. | ||||
|         // Rows of image and templ are reinterpreted as TypeVec<T, cn> vectors. | ||||
|         // Threads whose (x, y) falls outside result.cols x result.rows do nothing. | ||||
|         template <typename T, int cn> | ||||
|         __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result) | ||||
|         { | ||||
|             typedef typename TypeVec<T, cn>::vec_type pixel_t; | ||||
|             typedef typename TypeVec<float, cn>::vec_type accum_t; | ||||
|  | ||||
|             const int x = blockDim.x * blockIdx.x + threadIdx.x; | ||||
|             const int y = blockDim.y * blockIdx.y + threadIdx.y; | ||||
|  | ||||
|             // The launch grid may overhang the result; surplus threads leave right away. | ||||
|             if (x >= result.cols || y >= result.rows) | ||||
|                 return; | ||||
|  | ||||
|             accum_t acc = VecTraits<accum_t>::all(0); | ||||
|  | ||||
|             for (int dy = 0; dy < h; ++dy) | ||||
|             { | ||||
|                 const pixel_t* image_row = (const pixel_t*)image.ptr(y + dy); | ||||
|                 const pixel_t* templ_row = (const pixel_t*)templ.ptr(dy); | ||||
|  | ||||
|                 for (int dx = 0; dx < w; ++dx) | ||||
|                     acc = acc + mul(image_row[x + dx], templ_row[dx]); | ||||
|             } | ||||
|  | ||||
|             result.ptr(y)[x] = sum(acc); | ||||
|         } | ||||
|  | ||||
|         // Host-side launcher for matchTemplateNaiveKernel_CCORR<T, cn>. | ||||
|         // Uses 32x8 thread blocks and as many blocks as needed to cover result; the window | ||||
|         // size passed to the kernel is templ.cols x templ.rows. | ||||
|         // Launch errors go through cudaSafeCall; when stream is 0 the call also waits for the device. | ||||
|         template <typename T, int cn> | ||||
|         void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream) | ||||
|         { | ||||
|             const dim3 block(32, 8); | ||||
|             const dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y)); | ||||
|  | ||||
|             const int templ_w = templ.cols; | ||||
|             const int templ_h = templ.rows; | ||||
|  | ||||
|             matchTemplateNaiveKernel_CCORR<T, cn><<<grid, block, 0, stream>>>(templ_w, templ_h, image, templ, result); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             const bool blocking = (stream == 0); | ||||
|             if (blocking) | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|         } | ||||
|  | ||||
|         // Selects the float instantiation of matchTemplateNaive_CCORR by channel count. | ||||
|         // The table is indexed directly by cn and slot 0 is a null pointer, so cn must be 1..4. | ||||
|         void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream) | ||||
|         { | ||||
|             typedef void (*launcher_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream); | ||||
|  | ||||
|             static const launcher_t by_channels[] = | ||||
|             { | ||||
|                 0, | ||||
|                 matchTemplateNaive_CCORR<float, 1>, | ||||
|                 matchTemplateNaive_CCORR<float, 2>, | ||||
|                 matchTemplateNaive_CCORR<float, 3>, | ||||
|                 matchTemplateNaive_CCORR<float, 4> | ||||
|             }; | ||||
|  | ||||
|             const launcher_t launch = by_channels[cn]; | ||||
|             launch(image, templ, result, stream); | ||||
|         } | ||||
|  | ||||
|  | ||||
|         // Selects the uchar instantiation of matchTemplateNaive_CCORR by channel count. | ||||
|         // The table is indexed directly by cn and slot 0 is a null pointer, so cn must be 1..4. | ||||
|         void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream) | ||||
|         { | ||||
|             typedef void (*launcher_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream); | ||||
|  | ||||
|             static const launcher_t by_channels[] = | ||||
|             { | ||||
|                 0, | ||||
|                 matchTemplateNaive_CCORR<uchar, 1>, | ||||
|                 matchTemplateNaive_CCORR<uchar, 2>, | ||||
|                 matchTemplateNaive_CCORR<uchar, 3>, | ||||
|                 matchTemplateNaive_CCORR<uchar, 4> | ||||
|             }; | ||||
|  | ||||
|             const launcher_t launch = by_channels[cn]; | ||||
|             launch(image, templ, result, stream); | ||||
|         } | ||||
|  | ||||
|         ////////////////////////////////////////////////////////////////////// | ||||
|         // Naive_SQDIFF | ||||
|  | ||||
|         // Brute-force sum of squared differences: each thread handles one result element (x, y). | ||||
|         // Over the h x w window it accumulates, per channel, the square of | ||||
|         // image(y + dy, x + dx) - templ(dy, dx), and stores the sum over channels in result. | ||||
|         // Rows of image and templ are reinterpreted as TypeVec<T, cn> vectors. | ||||
|         // Threads whose (x, y) falls outside result.cols x result.rows do nothing. | ||||
|         template <typename T, int cn> | ||||
|         __global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result) | ||||
|         { | ||||
|             typedef typename TypeVec<T, cn>::vec_type pixel_t; | ||||
|             typedef typename TypeVec<float, cn>::vec_type accum_t; | ||||
|  | ||||
|             const int x = blockDim.x * blockIdx.x + threadIdx.x; | ||||
|             const int y = blockDim.y * blockIdx.y + threadIdx.y; | ||||
|  | ||||
|             // The launch grid may overhang the result; surplus threads leave right away. | ||||
|             if (x >= result.cols || y >= result.rows) | ||||
|                 return; | ||||
|  | ||||
|             accum_t acc = VecTraits<accum_t>::all(0); | ||||
|  | ||||
|             for (int dy = 0; dy < h; ++dy) | ||||
|             { | ||||
|                 const pixel_t* image_row = (const pixel_t*)image.ptr(y + dy); | ||||
|                 const pixel_t* templ_row = (const pixel_t*)templ.ptr(dy); | ||||
|  | ||||
|                 for (int dx = 0; dx < w; ++dx) | ||||
|                 { | ||||
|                     const accum_t diff = sub(image_row[x + dx], templ_row[dx]); | ||||
|                     acc = acc + diff * diff; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             result.ptr(y)[x] = sum(acc); | ||||
|         } | ||||
|  | ||||
|         // Host-side launcher for matchTemplateNaiveKernel_SQDIFF<T, cn>. | ||||
|         // Uses 32x8 thread blocks and as many blocks as needed to cover result; the window | ||||
|         // size passed to the kernel is templ.cols x templ.rows. | ||||
|         // Launch errors go through cudaSafeCall; when stream is 0 the call also waits for the device. | ||||
|         template <typename T, int cn> | ||||
|         void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream) | ||||
|         { | ||||
|             const dim3 block(32, 8); | ||||
|             const dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y)); | ||||
|  | ||||
|             const int templ_w = templ.cols; | ||||
|             const int templ_h = templ.rows; | ||||
|  | ||||
|             matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, block, 0, stream>>>(templ_w, templ_h, image, templ, result); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             const bool blocking = (stream == 0); | ||||
|             if (blocking) | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|         } | ||||
|  | ||||
|         // Selects the float instantiation of matchTemplateNaive_SQDIFF by channel count. | ||||
|         // The table is indexed directly by cn and slot 0 is a null pointer, so cn must be 1..4. | ||||
|         void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream) | ||||
|         { | ||||
|             typedef void (*launcher_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream); | ||||
|  | ||||
|             static const launcher_t by_channels[] = | ||||
|             { | ||||
|                 0, | ||||
|                 matchTemplateNaive_SQDIFF<float, 1>, | ||||
|                 matchTemplateNaive_SQDIFF<float, 2>, | ||||
|                 matchTemplateNaive_SQDIFF<float, 3>, | ||||
|                 matchTemplateNaive_SQDIFF<float, 4> | ||||
|             }; | ||||
|  | ||||
|             const launcher_t launch = by_channels[cn]; | ||||
|             launch(image, templ, result, stream); | ||||
|         } | ||||
|  | ||||
|         // Selects the uchar instantiation of matchTemplateNaive_SQDIFF by channel count. | ||||
|         // The table is indexed directly by cn and slot 0 is a null pointer, so cn must be 1..4. | ||||
|         void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream) | ||||
|         { | ||||
|             typedef void (*launcher_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream); | ||||
|  | ||||
|             static const launcher_t by_channels[] = | ||||
|             { | ||||
|                 0, | ||||
|                 matchTemplateNaive_SQDIFF<uchar, 1>, | ||||
|                 matchTemplateNaive_SQDIFF<uchar, 2>, | ||||
|                 matchTemplateNaive_SQDIFF<uchar, 3>, | ||||
|                 matchTemplateNaive_SQDIFF<uchar, 4> | ||||
|             }; | ||||
|  | ||||
|             const launcher_t launch = by_channels[cn]; | ||||
|             launch(image, templ, result, stream); | ||||
|         } | ||||
|  | ||||
|         ////////////////////////////////////////////////////////////////////// | ||||
|         // Prepared_SQDIFF | ||||
|  | ||||
|         // Turns the value already stored in result(y, x) (read as the cross-correlation term) | ||||
|         // into image_sqsum_ - 2 * ccorr + templ_sqsum, one thread per result element. | ||||
|         // image_sqsum_ is formed from four samples of image_sqsum at rows y and y + h and | ||||
|         // columns x * cn and (x + w) * cn (presumably a summed-area table of squares -- confirm with caller). | ||||
|         // Threads whose (x, y) falls outside result.cols x result.rows do nothing. | ||||
|         template <int cn> | ||||
|         __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result) | ||||
|         { | ||||
|             const int x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             if (x >= result.cols || y >= result.rows) | ||||
|                 return; | ||||
|  | ||||
|             const unsigned long long* top = image_sqsum.ptr(y); | ||||
|             const unsigned long long* bottom = image_sqsum.ptr(y + h); | ||||
|             const int left = x * cn; | ||||
|             const int right = (x + w) * cn; | ||||
|  | ||||
|             // The four-corner difference is taken in unsigned 64-bit arithmetic before the float cast. | ||||
|             const float image_sqsum_ = (float)( | ||||
|                     (bottom[right] - top[right]) - | ||||
|                     (bottom[left] - top[left])); | ||||
|  | ||||
|             float* dst = result.ptr(y) + x; | ||||
|             const float ccorr = *dst; | ||||
|             *dst = image_sqsum_ - 2.f * ccorr + templ_sqsum; | ||||
|         } | ||||
|  | ||||
|         // Host-side launcher for matchTemplatePreparedKernel_SQDIFF_8U<cn>. | ||||
|         // Uses 32x8 thread blocks and as many blocks as needed to cover result. | ||||
|         // Launch errors go through cudaSafeCall; when stream is 0 the call also waits for the device. | ||||
|         template <int cn> | ||||
|         void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream) | ||||
|         { | ||||
|             const dim3 block(32, 8); | ||||
|             const dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y)); | ||||
|  | ||||
|             matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, block, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             const bool blocking = (stream == 0); | ||||
|             if (blocking) | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|         } | ||||
|  | ||||
|         // Selects the matchTemplatePrepared_SQDIFF_8U<cn> instantiation by channel count. | ||||
|         // The table is indexed directly by cn and slot 0 is a null pointer, so cn must be 1..4. | ||||
|         void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn, | ||||
|                                              cudaStream_t stream) | ||||
|         { | ||||
|             typedef void (*launcher_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream); | ||||
|  | ||||
|             static const launcher_t by_channels[] = | ||||
|             { | ||||
|                 0, | ||||
|                 matchTemplatePrepared_SQDIFF_8U<1>, | ||||
|                 matchTemplatePrepared_SQDIFF_8U<2>, | ||||
|                 matchTemplatePrepared_SQDIFF_8U<3>, | ||||
|                 matchTemplatePrepared_SQDIFF_8U<4> | ||||
|             }; | ||||
|  | ||||
|             const launcher_t launch = by_channels[cn]; | ||||
|             launch(w, h, image_sqsum, templ_sqsum, result, stream); | ||||
|         } | ||||
|  | ||||
|         ////////////////////////////////////////////////////////////////////// | ||||
|         // Prepared_SQDIFF_NORMED | ||||
|  | ||||
|         // normAcc* are accurate normalization routines which make the GPU matchTemplate | ||||
|         // results consistent with the CPU implementation | ||||
|  | ||||
|         // Clamped division of num by denum: | ||||
|         //   |num| < denum          -> num / denum | ||||
|         //   |num| < 1.125 * denum  -> 1 if num > 0, otherwise -1 | ||||
|         //   anything else          -> 0 | ||||
|         __device__ float normAcc(float num, float denum) | ||||
|         { | ||||
|             const float magnitude = ::fabs(num); | ||||
|  | ||||
|             if (magnitude < denum) | ||||
|                 return num / denum; | ||||
|  | ||||
|             const bool slightly_over = magnitude < denum * 1.125f; | ||||
|             if (!slightly_over) | ||||
|                 return 0; | ||||
|  | ||||
|             return num > 0 ? 1 : -1; | ||||
|         } | ||||
|  | ||||
|  | ||||
|         // Clamped division of num by denum, with 1 as the fallback value: | ||||
|         //   |num| < denum          -> num / denum | ||||
|         //   |num| < 1.125 * denum  -> 1 if num > 0, otherwise -1 | ||||
|         //   anything else          -> 1 | ||||
|         __device__ float normAcc_SQDIFF(float num, float denum) | ||||
|         { | ||||
|             const float magnitude = ::fabs(num); | ||||
|  | ||||
|             if (magnitude < denum) | ||||
|                 return num / denum; | ||||
|  | ||||
|             const bool slightly_over = magnitude < denum * 1.125f; | ||||
|             if (!slightly_over) | ||||
|                 return 1; | ||||
|  | ||||
|             return num > 0 ? 1 : -1; | ||||
|         } | ||||
|  | ||||
|  | ||||
|         // Replaces result(y, x) (read as the cross-correlation term) with | ||||
|         // normAcc_SQDIFF(image_sqsum_ - 2 * ccorr + templ_sqsum, sqrt(image_sqsum_ * templ_sqsum)), | ||||
|         // one thread per result element. | ||||
|         // image_sqsum_ is formed from four samples of image_sqsum at rows y and y + h and | ||||
|         // columns x * cn and (x + w) * cn (presumably a summed-area table of squares -- confirm with caller). | ||||
|         // Threads whose (x, y) falls outside result.cols x result.rows do nothing. | ||||
|         template <int cn> | ||||
|         __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U( | ||||
|                 int w, int h, const PtrStep<unsigned long long> image_sqsum, | ||||
|                 unsigned long long templ_sqsum, PtrStepSzf result) | ||||
|         { | ||||
|             const int x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             if (x >= result.cols || y >= result.rows) | ||||
|                 return; | ||||
|  | ||||
|             const unsigned long long* top = image_sqsum.ptr(y); | ||||
|             const unsigned long long* bottom = image_sqsum.ptr(y + h); | ||||
|             const int left = x * cn; | ||||
|             const int right = (x + w) * cn; | ||||
|  | ||||
|             // The four-corner difference is taken in unsigned 64-bit arithmetic before the float cast. | ||||
|             const float image_sqsum_ = (float)( | ||||
|                     (bottom[right] - top[right]) - | ||||
|                     (bottom[left] - top[left])); | ||||
|  | ||||
|             float* dst = result.ptr(y) + x; | ||||
|             const float ccorr = *dst; | ||||
|  | ||||
|             const float num = image_sqsum_ - 2.f * ccorr + templ_sqsum; | ||||
|             const float denum = sqrtf(image_sqsum_ * templ_sqsum); | ||||
|             *dst = normAcc_SQDIFF(num, denum); | ||||
|         } | ||||
|  | ||||
|         // Host-side launcher for matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn>. | ||||
|         // Uses 32x8 thread blocks and as many blocks as needed to cover result. | ||||
|         // Launch errors go through cudaSafeCall; when stream is 0 the call also waits for the device. | ||||
|         template <int cn> | ||||
|         void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, | ||||
|                                                     PtrStepSzf result, cudaStream_t stream) | ||||
|         { | ||||
|             const dim3 block(32, 8); | ||||
|             const dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y)); | ||||
|  | ||||
|             matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, block, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             const bool blocking = (stream == 0); | ||||
|             if (blocking) | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|         } | ||||
|  | ||||
|  | ||||
|         // Selects the matchTemplatePrepared_SQDIFF_NORMED_8U<cn> instantiation by channel count. | ||||
|         // The table is indexed directly by cn and slot 0 is a null pointer, so cn must be 1..4. | ||||
|         void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, | ||||
|                                                     PtrStepSzf result, int cn, cudaStream_t stream) | ||||
|         { | ||||
|             typedef void (*launcher_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream); | ||||
|  | ||||
|             static const launcher_t by_channels[] = | ||||
|             { | ||||
|                 0, | ||||
|                 matchTemplatePrepared_SQDIFF_NORMED_8U<1>, | ||||
|                 matchTemplatePrepared_SQDIFF_NORMED_8U<2>, | ||||
|                 matchTemplatePrepared_SQDIFF_NORMED_8U<3>, | ||||
|                 matchTemplatePrepared_SQDIFF_NORMED_8U<4> | ||||
|             }; | ||||
|  | ||||
|             const launcher_t launch = by_channels[cn]; | ||||
|             launch(w, h, image_sqsum, templ_sqsum, result, stream); | ||||
|         } | ||||
|  | ||||
|         ////////////////////////////////////////////////////////////////////// | ||||
|         // Prepared_CCOFF | ||||
|  | ||||
|         // Subtracts image_sum_ * templ_sum_scale from the value already stored in result(y, x), | ||||
|         // one thread per result element. | ||||
|         // image_sum_ is formed from four samples of image_sum at rows y and y + h and | ||||
|         // columns x and x + w (presumably a summed-area table -- confirm with caller). | ||||
|         // Threads whose (x, y) falls outside result.cols x result.rows do nothing. | ||||
|         __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result) | ||||
|         { | ||||
|             const int x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             if (x >= result.cols || y >= result.rows) | ||||
|                 return; | ||||
|  | ||||
|             const unsigned int* top = image_sum.ptr(y); | ||||
|             const unsigned int* bottom = image_sum.ptr(y + h); | ||||
|  | ||||
|             // The four-corner difference is taken in unsigned arithmetic before the float cast. | ||||
|             const float image_sum_ = (float)( | ||||
|                     (bottom[x + w] - top[x + w]) - | ||||
|                     (bottom[x] - top[x])); | ||||
|  | ||||
|             float* dst = result.ptr(y) + x; | ||||
|             const float ccorr = *dst; | ||||
|             *dst = ccorr - image_sum_ * templ_sum_scale; | ||||
|         } | ||||
|  | ||||
|         // Host-side launcher for matchTemplatePreparedKernel_CCOFF_8U. | ||||
|         // The kernel receives templ_sum divided by the template area w * h as its scale. | ||||
|         // Uses 32x8 thread blocks and as many blocks as needed to cover result. | ||||
|         // Launch errors go through cudaSafeCall; when stream is 0 the call also waits for the device. | ||||
|         void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream) | ||||
|         { | ||||
|             dim3 block(32, 8); | ||||
|             dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y)); | ||||
|  | ||||
|             const float templ_sum_scale = (float)templ_sum / (w * h); | ||||
|  | ||||
|             matchTemplatePreparedKernel_CCOFF_8U<<<grid, block, 0, stream>>>(w, h, templ_sum_scale, image_sum, result); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             const bool blocking = (stream == 0); | ||||
|             if (blocking) | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|         } | ||||
|  | ||||
|  | ||||
|  | ||||
|         // Two-channel variant: subtracts image_sum_r_ * templ_sum_scale_r and | ||||
|         // image_sum_g_ * templ_sum_scale_g from the value already stored in result(y, x). | ||||
|         // Each per-channel sum is formed from four samples of its table at rows y and y + h | ||||
|         // and columns x and x + w (presumably summed-area tables -- confirm with caller). | ||||
|         // Threads whose (x, y) falls outside result.cols x result.rows do nothing. | ||||
|         __global__ void matchTemplatePreparedKernel_CCOFF_8UC2( | ||||
|                 int w, int h, float templ_sum_scale_r, float templ_sum_scale_g, | ||||
|                 const PtrStep<unsigned int> image_sum_r, | ||||
|                 const PtrStep<unsigned int> image_sum_g, | ||||
|                 PtrStepSzf result) | ||||
|         { | ||||
|             const int x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             if (x >= result.cols || y >= result.rows) | ||||
|                 return; | ||||
|  | ||||
|             const unsigned int* r_top = image_sum_r.ptr(y); | ||||
|             const unsigned int* r_bottom = image_sum_r.ptr(y + h); | ||||
|             const float image_sum_r_ = (float)( | ||||
|                     (r_bottom[x + w] - r_top[x + w]) - | ||||
|                     (r_bottom[x] - r_top[x])); | ||||
|  | ||||
|             const unsigned int* g_top = image_sum_g.ptr(y); | ||||
|             const unsigned int* g_bottom = image_sum_g.ptr(y + h); | ||||
|             const float image_sum_g_ = (float)( | ||||
|                     (g_bottom[x + w] - g_top[x + w]) - | ||||
|                     (g_bottom[x] - g_top[x])); | ||||
|  | ||||
|             float* dst = result.ptr(y) + x; | ||||
|             const float ccorr = *dst; | ||||
|             *dst = ccorr - image_sum_r_ * templ_sum_scale_r | ||||
|                          - image_sum_g_ * templ_sum_scale_g; | ||||
|         } | ||||
|  | ||||
|         // Host-side launcher for matchTemplatePreparedKernel_CCOFF_8UC2. | ||||
|         // Each template channel sum is divided by the template area w * h before being passed on. | ||||
|         // Uses 32x8 thread blocks and as many blocks as needed to cover result. | ||||
|         // Launch errors go through cudaSafeCall; when stream is 0 the call also waits for the device. | ||||
|         void matchTemplatePrepared_CCOFF_8UC2( | ||||
|                 int w, int h, | ||||
|                 const PtrStepSz<unsigned int> image_sum_r, | ||||
|                 const PtrStepSz<unsigned int> image_sum_g, | ||||
|                 unsigned int templ_sum_r, unsigned int templ_sum_g, | ||||
|                 PtrStepSzf result, cudaStream_t stream) | ||||
|         { | ||||
|             dim3 block(32, 8); | ||||
|             dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y)); | ||||
|  | ||||
|             const float scale_r = (float)templ_sum_r / (w * h); | ||||
|             const float scale_g = (float)templ_sum_g / (w * h); | ||||
|  | ||||
|             matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, block, 0, stream>>>( | ||||
|                     w, h, scale_r, scale_g, | ||||
|                     image_sum_r, image_sum_g, result); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             const bool blocking = (stream == 0); | ||||
|             if (blocking) | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|         } | ||||
|  | ||||
|  | ||||
|  | ||||
|         // Three-channel variant: subtracts image_sum_{r,g,b}_ * templ_sum_scale_{r,g,b} | ||||
|         // from the value already stored in result(y, x). | ||||
|         // Each per-channel sum is formed from four samples of its table at rows y and y + h | ||||
|         // and columns x and x + w (presumably summed-area tables -- confirm with caller). | ||||
|         // Threads whose (x, y) falls outside result.cols x result.rows do nothing. | ||||
|         __global__ void matchTemplatePreparedKernel_CCOFF_8UC3( | ||||
|                 int w, int h, | ||||
|                 float templ_sum_scale_r, | ||||
|                 float templ_sum_scale_g, | ||||
|                 float templ_sum_scale_b, | ||||
|                 const PtrStep<unsigned int> image_sum_r, | ||||
|                 const PtrStep<unsigned int> image_sum_g, | ||||
|                 const PtrStep<unsigned int> image_sum_b, | ||||
|                 PtrStepSzf result) | ||||
|         { | ||||
|             const int x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             if (x >= result.cols || y >= result.rows) | ||||
|                 return; | ||||
|  | ||||
|             const unsigned int* r_top = image_sum_r.ptr(y); | ||||
|             const unsigned int* r_bottom = image_sum_r.ptr(y + h); | ||||
|             const float image_sum_r_ = (float)( | ||||
|                     (r_bottom[x + w] - r_top[x + w]) - | ||||
|                     (r_bottom[x] - r_top[x])); | ||||
|  | ||||
|             const unsigned int* g_top = image_sum_g.ptr(y); | ||||
|             const unsigned int* g_bottom = image_sum_g.ptr(y + h); | ||||
|             const float image_sum_g_ = (float)( | ||||
|                     (g_bottom[x + w] - g_top[x + w]) - | ||||
|                     (g_bottom[x] - g_top[x])); | ||||
|  | ||||
|             const unsigned int* b_top = image_sum_b.ptr(y); | ||||
|             const unsigned int* b_bottom = image_sum_b.ptr(y + h); | ||||
|             const float image_sum_b_ = (float)( | ||||
|                     (b_bottom[x + w] - b_top[x + w]) - | ||||
|                     (b_bottom[x] - b_top[x])); | ||||
|  | ||||
|             float* dst = result.ptr(y) + x; | ||||
|             const float ccorr = *dst; | ||||
|             *dst = ccorr - image_sum_r_ * templ_sum_scale_r | ||||
|                          - image_sum_g_ * templ_sum_scale_g | ||||
|                          - image_sum_b_ * templ_sum_scale_b; | ||||
|         } | ||||
|  | ||||
|         // Host-side launcher for matchTemplatePreparedKernel_CCOFF_8UC3. | ||||
|         // Each template channel sum is divided by the template area w * h before being passed on. | ||||
|         // Uses 32x8 thread blocks and as many blocks as needed to cover result. | ||||
|         // Launch errors go through cudaSafeCall; when stream is 0 the call also waits for the device. | ||||
|         void matchTemplatePrepared_CCOFF_8UC3( | ||||
|                 int w, int h, | ||||
|                 const PtrStepSz<unsigned int> image_sum_r, | ||||
|                 const PtrStepSz<unsigned int> image_sum_g, | ||||
|                 const PtrStepSz<unsigned int> image_sum_b, | ||||
|                 unsigned int templ_sum_r, | ||||
|                 unsigned int templ_sum_g, | ||||
|                 unsigned int templ_sum_b, | ||||
|                 PtrStepSzf result, cudaStream_t stream) | ||||
|         { | ||||
|             dim3 block(32, 8); | ||||
|             dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y)); | ||||
|  | ||||
|             const float scale_r = (float)templ_sum_r / (w * h); | ||||
|             const float scale_g = (float)templ_sum_g / (w * h); | ||||
|             const float scale_b = (float)templ_sum_b / (w * h); | ||||
|  | ||||
|             matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, block, 0, stream>>>( | ||||
|                     w, h, | ||||
|                     scale_r, scale_g, scale_b, | ||||
|                     image_sum_r, image_sum_g, image_sum_b, result); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             const bool blocking = (stream == 0); | ||||
|             if (blocking) | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|         } | ||||
|  | ||||
|  | ||||
|  | ||||
|         // Four-channel variant: subtracts image_sum_{r,g,b,a}_ * templ_sum_scale_{r,g,b,a} | ||||
|         // from the value already stored in result(y, x). | ||||
|         // Each per-channel sum is formed from four samples of its table at rows y and y + h | ||||
|         // and columns x and x + w (presumably summed-area tables -- confirm with caller). | ||||
|         // Threads whose (x, y) falls outside result.cols x result.rows do nothing. | ||||
|         __global__ void matchTemplatePreparedKernel_CCOFF_8UC4( | ||||
|                 int w, int h, | ||||
|                 float templ_sum_scale_r, | ||||
|                 float templ_sum_scale_g, | ||||
|                 float templ_sum_scale_b, | ||||
|                 float templ_sum_scale_a, | ||||
|                 const PtrStep<unsigned int> image_sum_r, | ||||
|                 const PtrStep<unsigned int> image_sum_g, | ||||
|                 const PtrStep<unsigned int> image_sum_b, | ||||
|                 const PtrStep<unsigned int> image_sum_a, | ||||
|                 PtrStepSzf result) | ||||
|         { | ||||
|             const int x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             if (x >= result.cols || y >= result.rows) | ||||
|                 return; | ||||
|  | ||||
|             const unsigned int* r_top = image_sum_r.ptr(y); | ||||
|             const unsigned int* r_bottom = image_sum_r.ptr(y + h); | ||||
|             const float image_sum_r_ = (float)( | ||||
|                     (r_bottom[x + w] - r_top[x + w]) - | ||||
|                     (r_bottom[x] - r_top[x])); | ||||
|  | ||||
|             const unsigned int* g_top = image_sum_g.ptr(y); | ||||
|             const unsigned int* g_bottom = image_sum_g.ptr(y + h); | ||||
|             const float image_sum_g_ = (float)( | ||||
|                     (g_bottom[x + w] - g_top[x + w]) - | ||||
|                     (g_bottom[x] - g_top[x])); | ||||
|  | ||||
|             const unsigned int* b_top = image_sum_b.ptr(y); | ||||
|             const unsigned int* b_bottom = image_sum_b.ptr(y + h); | ||||
|             const float image_sum_b_ = (float)( | ||||
|                     (b_bottom[x + w] - b_top[x + w]) - | ||||
|                     (b_bottom[x] - b_top[x])); | ||||
|  | ||||
|             const unsigned int* a_top = image_sum_a.ptr(y); | ||||
|             const unsigned int* a_bottom = image_sum_a.ptr(y + h); | ||||
|             const float image_sum_a_ = (float)( | ||||
|                     (a_bottom[x + w] - a_top[x + w]) - | ||||
|                     (a_bottom[x] - a_top[x])); | ||||
|  | ||||
|             float* dst = result.ptr(y) + x; | ||||
|             const float ccorr = *dst; | ||||
|             *dst = ccorr - image_sum_r_ * templ_sum_scale_r | ||||
|                          - image_sum_g_ * templ_sum_scale_g | ||||
|                          - image_sum_b_ * templ_sum_scale_b | ||||
|                          - image_sum_a_ * templ_sum_scale_a; | ||||
|         } | ||||
|  | ||||
|         // Host-side launcher for matchTemplatePreparedKernel_CCOFF_8UC4. | ||||
|         // Each template channel sum is divided by the template area w * h before being passed on. | ||||
|         // Uses 32x8 thread blocks and as many blocks as needed to cover result. | ||||
|         // Launch errors go through cudaSafeCall; when stream is 0 the call also waits for the device. | ||||
|         void matchTemplatePrepared_CCOFF_8UC4( | ||||
|                 int w, int h, | ||||
|                 const PtrStepSz<unsigned int> image_sum_r, | ||||
|                 const PtrStepSz<unsigned int> image_sum_g, | ||||
|                 const PtrStepSz<unsigned int> image_sum_b, | ||||
|                 const PtrStepSz<unsigned int> image_sum_a, | ||||
|                 unsigned int templ_sum_r, | ||||
|                 unsigned int templ_sum_g, | ||||
|                 unsigned int templ_sum_b, | ||||
|                 unsigned int templ_sum_a, | ||||
|                 PtrStepSzf result, cudaStream_t stream) | ||||
|         { | ||||
|             dim3 block(32, 8); | ||||
|             dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y)); | ||||
|  | ||||
|             const float scale_r = (float)templ_sum_r / (w * h); | ||||
|             const float scale_g = (float)templ_sum_g / (w * h); | ||||
|             const float scale_b = (float)templ_sum_b / (w * h); | ||||
|             const float scale_a = (float)templ_sum_a / (w * h); | ||||
|  | ||||
|             matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, block, 0, stream>>>( | ||||
|                     w, h, | ||||
|                     scale_r, scale_g, scale_b, scale_a, | ||||
|                     image_sum_r, image_sum_g, image_sum_b, image_sum_a, | ||||
|                     result); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             const bool blocking = (stream == 0); | ||||
|             if (blocking) | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|         } | ||||
|  | ||||
|         ////////////////////////////////////////////////////////////////////// | ||||
|         // Prepared_CCOFF_NORMED | ||||
|  | ||||
|         // Replaces result(y, x) (read as the cross-correlation term) with | ||||
|         // normAcc(ccorr - image_sum_ * templ_sum_scale, | ||||
|         //         sqrt(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_^2))), | ||||
|         // one thread per result element. | ||||
|         // image_sum_ and image_sqsum_ are each formed from four samples of their table at rows | ||||
|         // y and y + h and columns x and x + w (presumably summed-area tables -- confirm with caller). | ||||
|         // Threads whose (x, y) falls outside result.cols x result.rows do nothing. | ||||
|         __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U( | ||||
|                 int w, int h, float weight, | ||||
|                 float templ_sum_scale, float templ_sqsum_scale, | ||||
|                 const PtrStep<unsigned int> image_sum, | ||||
|                 const PtrStep<unsigned long long> image_sqsum, | ||||
|                 PtrStepSzf result) | ||||
|         { | ||||
|             const int x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             if (x >= result.cols || y >= result.rows) | ||||
|                 return; | ||||
|  | ||||
|             float* dst = result.ptr(y) + x; | ||||
|             const float ccorr = *dst; | ||||
|  | ||||
|             const unsigned int* sum_top = image_sum.ptr(y); | ||||
|             const unsigned int* sum_bottom = image_sum.ptr(y + h); | ||||
|             const float image_sum_ = (float)( | ||||
|                     (sum_bottom[x + w] - sum_top[x + w]) - | ||||
|                     (sum_bottom[x] - sum_top[x])); | ||||
|  | ||||
|             const unsigned long long* sq_top = image_sqsum.ptr(y); | ||||
|             const unsigned long long* sq_bottom = image_sqsum.ptr(y + h); | ||||
|             const float image_sqsum_ = (float)( | ||||
|                     (sq_bottom[x + w] - sq_top[x + w]) - | ||||
|                     (sq_bottom[x] - sq_top[x])); | ||||
|  | ||||
|             const float num = ccorr - image_sum_ * templ_sum_scale; | ||||
|             const float denum = sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)); | ||||
|             *dst = normAcc(num, denum); | ||||
|         } | ||||
|  | ||||
|         // Host-side launcher for matchTemplatePreparedKernel_CCOFF_NORMED_8U. | ||||
|         // Precomputes, in float, weight = 1 / (w * h), templ_sum * weight and | ||||
|         // templ_sqsum - weight * templ_sum * templ_sum, and passes them to the kernel. | ||||
|         // Uses 32x8 thread blocks and as many blocks as needed to cover result. | ||||
|         // Launch errors go through cudaSafeCall; when stream is 0 the call also waits for the device. | ||||
|         void matchTemplatePrepared_CCOFF_NORMED_8U( | ||||
|                     int w, int h, const PtrStepSz<unsigned int> image_sum, | ||||
|                     const PtrStepSz<unsigned long long> image_sqsum, | ||||
|                     unsigned int templ_sum, unsigned long long templ_sqsum, | ||||
|                     PtrStepSzf result, cudaStream_t stream) | ||||
|         { | ||||
|             dim3 block(32, 8); | ||||
|             dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y)); | ||||
|  | ||||
|             float inv_area = 1.f / (w * h); | ||||
|             float sum_scale = templ_sum * inv_area; | ||||
|             float sqsum_scale = templ_sqsum - inv_area * templ_sum * templ_sum; | ||||
|  | ||||
|             matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, block, 0, stream>>>( | ||||
|                     w, h, inv_area, sum_scale, sqsum_scale, | ||||
|                     image_sum, image_sqsum, result); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             const bool blocking = (stream == 0); | ||||
|             if (blocking) | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|         } | ||||
|  | ||||
|  | ||||
|  | ||||
|         // Two-channel variant of the normalized CCOFF step, one thread per result element. | ||||
|         // Numerator: result(y, x) minus image_sum_{r,g}_ * templ_sum_scale_{r,g}. | ||||
|         // Denominator: sqrt(templ_sqsum_scale * sum over channels of | ||||
|         //              (image_sqsum_c_ - weight * image_sum_c_^2)). | ||||
|         // The quotient is produced by normAcc and written back to result(y, x). | ||||
|         // Each window sum is formed from four samples of its table at rows y and y + h and | ||||
|         // columns x and x + w (presumably summed-area tables -- confirm with caller). | ||||
|         // Threads whose (x, y) falls outside result.cols x result.rows do nothing. | ||||
|         __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2( | ||||
|                 int w, int h, float weight, | ||||
|                 float templ_sum_scale_r, float templ_sum_scale_g, | ||||
|                 float templ_sqsum_scale, | ||||
|                 const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r, | ||||
|                 const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g, | ||||
|                 PtrStepSzf result) | ||||
|         { | ||||
|             const int x = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const int y = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|  | ||||
|             if (x >= result.cols || y >= result.rows) | ||||
|                 return; | ||||
|  | ||||
|             const unsigned int* r_top = image_sum_r.ptr(y); | ||||
|             const unsigned int* r_bottom = image_sum_r.ptr(y + h); | ||||
|             const float image_sum_r_ = (float)( | ||||
|                     (r_bottom[x + w] - r_top[x + w]) - | ||||
|                     (r_bottom[x] - r_top[x])); | ||||
|  | ||||
|             const unsigned long long* r_sq_top = image_sqsum_r.ptr(y); | ||||
|             const unsigned long long* r_sq_bottom = image_sqsum_r.ptr(y + h); | ||||
|             const float image_sqsum_r_ = (float)( | ||||
|                     (r_sq_bottom[x + w] - r_sq_top[x + w]) - | ||||
|                     (r_sq_bottom[x] - r_sq_top[x])); | ||||
|  | ||||
|             const unsigned int* g_top = image_sum_g.ptr(y); | ||||
|             const unsigned int* g_bottom = image_sum_g.ptr(y + h); | ||||
|             const float image_sum_g_ = (float)( | ||||
|                     (g_bottom[x + w] - g_top[x + w]) - | ||||
|                     (g_bottom[x] - g_top[x])); | ||||
|  | ||||
|             const unsigned long long* g_sq_top = image_sqsum_g.ptr(y); | ||||
|             const unsigned long long* g_sq_bottom = image_sqsum_g.ptr(y + h); | ||||
|             const float image_sqsum_g_ = (float)( | ||||
|                     (g_sq_bottom[x + w] - g_sq_top[x + w]) - | ||||
|                     (g_sq_bottom[x] - g_sq_top[x])); | ||||
|  | ||||
|             float* dst = result.ptr(y) + x; | ||||
|  | ||||
|             const float num = *dst - image_sum_r_ * templ_sum_scale_r | ||||
|                                    - image_sum_g_ * templ_sum_scale_g; | ||||
|             const float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_ | ||||
|                                                      + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_)); | ||||
|             *dst = normAcc(num, denum); | ||||
|         } | ||||
|  | ||||
|         // Host-side launcher for matchTemplatePreparedKernel_CCOFF_NORMED_8UC2. | ||||
|         // | ||||
|         // Derives 1 / (w * h), the per-channel scaled template sums and one combined template | ||||
|         // term from the template statistics, then launches the kernel with one thread per | ||||
|         // result element. The *_r / *_g arguments are the per-channel tables and statistics. | ||||
|         // For stream 0 the call also waits for the device to finish. | ||||
|         void matchTemplatePrepared_CCOFF_NORMED_8UC2( | ||||
|                     int w, int h, | ||||
|                     const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r, | ||||
|                     const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g, | ||||
|                     unsigned int templ_sum_r, unsigned long long templ_sqsum_r, | ||||
|                     unsigned int templ_sum_g, unsigned long long templ_sqsum_g, | ||||
|                     PtrStepSzf result, cudaStream_t stream) | ||||
|         { | ||||
|             // One thread per result element, 32x8 threads per block. | ||||
|             const dim3 block(32, 8); | ||||
|             const dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y)); | ||||
|  | ||||
|             const float inv_area = 1.f / (w * h); | ||||
|             const float scaled_sum_r = templ_sum_r * inv_area; | ||||
|             const float scaled_sum_g = templ_sum_g * inv_area; | ||||
|             // Combined template term, accumulated channel by channel. | ||||
|             const float templ_energy = templ_sqsum_r - inv_area * templ_sum_r * templ_sum_r | ||||
|                                        + templ_sqsum_g - inv_area * templ_sum_g * templ_sum_g; | ||||
|  | ||||
|             matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, block, 0, stream>>>( | ||||
|                     w, h, inv_area, | ||||
|                     scaled_sum_r, scaled_sum_g, | ||||
|                     templ_energy, | ||||
|                     image_sum_r, image_sqsum_r, | ||||
|                     image_sum_g, image_sqsum_g, | ||||
|                     result); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             // Stream 0 is handled synchronously: block until the device is done. | ||||
|             if (stream == 0) | ||||
|             { | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|  | ||||
|  | ||||
|         // Three-channel variant of the CCOFF_NORMED post-processing kernel. | ||||
|         // | ||||
|         // One thread handles one element (x, y) of result. For each channel it forms the | ||||
|         // four-corner difference of the sum and squared-sum tables at columns x, x + w and | ||||
|         // rows y, y + h, subtracts the scaled window sums from the value already stored in | ||||
|         // result, and overwrites that element with normAcc(numerator, denominator). | ||||
|         // | ||||
|         // NOTE(review): the tables are read at row y + h and column x + w without a bounds | ||||
|         // check, so they presumably have at least result.rows + h rows and result.cols + w | ||||
|         // columns - confirm against the caller. | ||||
|         __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3( | ||||
|                 int w, int h, float weight, | ||||
|                 float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, | ||||
|                 float templ_sqsum_scale, | ||||
|                 const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r, | ||||
|                 const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g, | ||||
|                 const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b, | ||||
|                 PtrStepSzf result) | ||||
|         { | ||||
|             const int x = threadIdx.x + blockIdx.x * blockDim.x; | ||||
|             const int y = threadIdx.y + blockIdx.y * blockDim.y; | ||||
|  | ||||
|             // Threads mapped outside the result have nothing to do. | ||||
|             if (x >= result.cols || y >= result.rows) | ||||
|                 return; | ||||
|  | ||||
|             const int xr = x + w;   // far column of the window | ||||
|             const int yb = y + h;   // far row of the window | ||||
|  | ||||
|             const float sum_r = (float)( | ||||
|                     (image_sum_r.ptr(yb)[xr] - image_sum_r.ptr(y)[xr]) - | ||||
|                     (image_sum_r.ptr(yb)[x] - image_sum_r.ptr(y)[x])); | ||||
|             const float sqsum_r = (float)( | ||||
|                     (image_sqsum_r.ptr(yb)[xr] - image_sqsum_r.ptr(y)[xr]) - | ||||
|                     (image_sqsum_r.ptr(yb)[x] - image_sqsum_r.ptr(y)[x])); | ||||
|             const float sum_g = (float)( | ||||
|                     (image_sum_g.ptr(yb)[xr] - image_sum_g.ptr(y)[xr]) - | ||||
|                     (image_sum_g.ptr(yb)[x] - image_sum_g.ptr(y)[x])); | ||||
|             const float sqsum_g = (float)( | ||||
|                     (image_sqsum_g.ptr(yb)[xr] - image_sqsum_g.ptr(y)[xr]) - | ||||
|                     (image_sqsum_g.ptr(yb)[x] - image_sqsum_g.ptr(y)[x])); | ||||
|             const float sum_b = (float)( | ||||
|                     (image_sum_b.ptr(yb)[xr] - image_sum_b.ptr(y)[xr]) - | ||||
|                     (image_sum_b.ptr(yb)[x] - image_sum_b.ptr(y)[x])); | ||||
|             const float sqsum_b = (float)( | ||||
|                     (image_sqsum_b.ptr(yb)[xr] - image_sqsum_b.ptr(y)[xr]) - | ||||
|                     (image_sqsum_b.ptr(yb)[x] - image_sqsum_b.ptr(y)[x])); | ||||
|  | ||||
|             float* const dst = result.ptr(y) + x; | ||||
|  | ||||
|             const float numer = *dst - sum_r * templ_sum_scale_r | ||||
|                                      - sum_g * templ_sum_scale_g | ||||
|                                      - sum_b * templ_sum_scale_b; | ||||
|             const float denom = sqrtf(templ_sqsum_scale * (sqsum_r - weight * sum_r * sum_r | ||||
|                                                            + sqsum_g - weight * sum_g * sum_g | ||||
|                                                            + sqsum_b - weight * sum_b * sum_b)); | ||||
|             *dst = normAcc(numer, denom); | ||||
|         } | ||||
|  | ||||
|         // Host-side launcher for matchTemplatePreparedKernel_CCOFF_NORMED_8UC3. | ||||
|         // | ||||
|         // Derives 1 / (w * h), the per-channel scaled template sums and one combined template | ||||
|         // term from the template statistics, then launches the kernel with one thread per | ||||
|         // result element. The *_r / *_g / *_b arguments are the per-channel tables and | ||||
|         // statistics. For stream 0 the call also waits for the device to finish. | ||||
|         void matchTemplatePrepared_CCOFF_NORMED_8UC3( | ||||
|                     int w, int h, | ||||
|                     const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r, | ||||
|                     const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g, | ||||
|                     const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b, | ||||
|                     unsigned int templ_sum_r, unsigned long long templ_sqsum_r, | ||||
|                     unsigned int templ_sum_g, unsigned long long templ_sqsum_g, | ||||
|                     unsigned int templ_sum_b, unsigned long long templ_sqsum_b, | ||||
|                     PtrStepSzf result, cudaStream_t stream) | ||||
|         { | ||||
|             // One thread per result element, 32x8 threads per block. | ||||
|             const dim3 block(32, 8); | ||||
|             const dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y)); | ||||
|  | ||||
|             const float inv_area = 1.f / (w * h); | ||||
|             const float scaled_sum_r = templ_sum_r * inv_area; | ||||
|             const float scaled_sum_g = templ_sum_g * inv_area; | ||||
|             const float scaled_sum_b = templ_sum_b * inv_area; | ||||
|             // Combined template term, accumulated channel by channel. | ||||
|             const float templ_energy = templ_sqsum_r - inv_area * templ_sum_r * templ_sum_r | ||||
|                                        + templ_sqsum_g - inv_area * templ_sum_g * templ_sum_g | ||||
|                                        + templ_sqsum_b - inv_area * templ_sum_b * templ_sum_b; | ||||
|  | ||||
|             matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, block, 0, stream>>>( | ||||
|                     w, h, inv_area, | ||||
|                     scaled_sum_r, scaled_sum_g, scaled_sum_b, | ||||
|                     templ_energy, | ||||
|                     image_sum_r, image_sqsum_r, | ||||
|                     image_sum_g, image_sqsum_g, | ||||
|                     image_sum_b, image_sqsum_b, | ||||
|                     result); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             // Stream 0 is handled synchronously: block until the device is done. | ||||
|             if (stream == 0) | ||||
|             { | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|  | ||||
|  | ||||
|         // Four-channel variant of the CCOFF_NORMED post-processing kernel. | ||||
|         // | ||||
|         // One thread handles one element (x, y) of result. For each channel it forms the | ||||
|         // four-corner difference of the sum and squared-sum tables at columns x, x + w and | ||||
|         // rows y, y + h, subtracts the scaled window sums from the value already stored in | ||||
|         // result, and overwrites that element with normAcc(numerator, denominator). | ||||
|         // | ||||
|         // NOTE(review): the tables are read at row y + h and column x + w without a bounds | ||||
|         // check, so they presumably have at least result.rows + h rows and result.cols + w | ||||
|         // columns - confirm against the caller. | ||||
|         __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4( | ||||
|                 int w, int h, float weight, | ||||
|                 float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, | ||||
|                 float templ_sum_scale_a, float templ_sqsum_scale, | ||||
|                 const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r, | ||||
|                 const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g, | ||||
|                 const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b, | ||||
|                 const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a, | ||||
|                 PtrStepSzf result) | ||||
|         { | ||||
|             const int x = threadIdx.x + blockIdx.x * blockDim.x; | ||||
|             const int y = threadIdx.y + blockIdx.y * blockDim.y; | ||||
|  | ||||
|             // Threads mapped outside the result have nothing to do. | ||||
|             if (x >= result.cols || y >= result.rows) | ||||
|                 return; | ||||
|  | ||||
|             const int xr = x + w;   // far column of the window | ||||
|             const int yb = y + h;   // far row of the window | ||||
|  | ||||
|             const float sum_r = (float)( | ||||
|                     (image_sum_r.ptr(yb)[xr] - image_sum_r.ptr(y)[xr]) - | ||||
|                     (image_sum_r.ptr(yb)[x] - image_sum_r.ptr(y)[x])); | ||||
|             const float sqsum_r = (float)( | ||||
|                     (image_sqsum_r.ptr(yb)[xr] - image_sqsum_r.ptr(y)[xr]) - | ||||
|                     (image_sqsum_r.ptr(yb)[x] - image_sqsum_r.ptr(y)[x])); | ||||
|             const float sum_g = (float)( | ||||
|                     (image_sum_g.ptr(yb)[xr] - image_sum_g.ptr(y)[xr]) - | ||||
|                     (image_sum_g.ptr(yb)[x] - image_sum_g.ptr(y)[x])); | ||||
|             const float sqsum_g = (float)( | ||||
|                     (image_sqsum_g.ptr(yb)[xr] - image_sqsum_g.ptr(y)[xr]) - | ||||
|                     (image_sqsum_g.ptr(yb)[x] - image_sqsum_g.ptr(y)[x])); | ||||
|             const float sum_b = (float)( | ||||
|                     (image_sum_b.ptr(yb)[xr] - image_sum_b.ptr(y)[xr]) - | ||||
|                     (image_sum_b.ptr(yb)[x] - image_sum_b.ptr(y)[x])); | ||||
|             const float sqsum_b = (float)( | ||||
|                     (image_sqsum_b.ptr(yb)[xr] - image_sqsum_b.ptr(y)[xr]) - | ||||
|                     (image_sqsum_b.ptr(yb)[x] - image_sqsum_b.ptr(y)[x])); | ||||
|             const float sum_a = (float)( | ||||
|                     (image_sum_a.ptr(yb)[xr] - image_sum_a.ptr(y)[xr]) - | ||||
|                     (image_sum_a.ptr(yb)[x] - image_sum_a.ptr(y)[x])); | ||||
|             const float sqsum_a = (float)( | ||||
|                     (image_sqsum_a.ptr(yb)[xr] - image_sqsum_a.ptr(y)[xr]) - | ||||
|                     (image_sqsum_a.ptr(yb)[x] - image_sqsum_a.ptr(y)[x])); | ||||
|  | ||||
|             float* const dst = result.ptr(y) + x; | ||||
|  | ||||
|             const float numer = *dst - sum_r * templ_sum_scale_r - sum_g * templ_sum_scale_g | ||||
|                                      - sum_b * templ_sum_scale_b - sum_a * templ_sum_scale_a; | ||||
|             const float denom = sqrtf(templ_sqsum_scale * (sqsum_r - weight * sum_r * sum_r | ||||
|                                                            + sqsum_g - weight * sum_g * sum_g | ||||
|                                                            + sqsum_b - weight * sum_b * sum_b | ||||
|                                                            + sqsum_a - weight * sum_a * sum_a)); | ||||
|             *dst = normAcc(numer, denom); | ||||
|         } | ||||
|  | ||||
|         // Host-side launcher for matchTemplatePreparedKernel_CCOFF_NORMED_8UC4. | ||||
|         // | ||||
|         // Derives 1 / (w * h), the per-channel scaled template sums and one combined template | ||||
|         // term from the template statistics, then launches the kernel with one thread per | ||||
|         // result element. The *_r / *_g / *_b / *_a arguments are the per-channel tables and | ||||
|         // statistics. For stream 0 the call also waits for the device to finish. | ||||
|         void matchTemplatePrepared_CCOFF_NORMED_8UC4( | ||||
|                     int w, int h, | ||||
|                     const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r, | ||||
|                     const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g, | ||||
|                     const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b, | ||||
|                     const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a, | ||||
|                     unsigned int templ_sum_r, unsigned long long templ_sqsum_r, | ||||
|                     unsigned int templ_sum_g, unsigned long long templ_sqsum_g, | ||||
|                     unsigned int templ_sum_b, unsigned long long templ_sqsum_b, | ||||
|                     unsigned int templ_sum_a, unsigned long long templ_sqsum_a, | ||||
|                     PtrStepSzf result, cudaStream_t stream) | ||||
|         { | ||||
|             // One thread per result element, 32x8 threads per block. | ||||
|             const dim3 block(32, 8); | ||||
|             const dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y)); | ||||
|  | ||||
|             const float inv_area = 1.f / (w * h); | ||||
|             const float scaled_sum_r = templ_sum_r * inv_area; | ||||
|             const float scaled_sum_g = templ_sum_g * inv_area; | ||||
|             const float scaled_sum_b = templ_sum_b * inv_area; | ||||
|             const float scaled_sum_a = templ_sum_a * inv_area; | ||||
|             // Combined template term, accumulated channel by channel. | ||||
|             const float templ_energy = templ_sqsum_r - inv_area * templ_sum_r * templ_sum_r | ||||
|                                        + templ_sqsum_g - inv_area * templ_sum_g * templ_sum_g | ||||
|                                        + templ_sqsum_b - inv_area * templ_sum_b * templ_sum_b | ||||
|                                        + templ_sqsum_a - inv_area * templ_sum_a * templ_sum_a; | ||||
|  | ||||
|             matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, block, 0, stream>>>( | ||||
|                     w, h, inv_area, | ||||
|                     scaled_sum_r, scaled_sum_g, scaled_sum_b, scaled_sum_a, | ||||
|                     templ_energy, | ||||
|                     image_sum_r, image_sqsum_r, | ||||
|                     image_sum_g, image_sqsum_g, | ||||
|                     image_sum_b, image_sqsum_b, | ||||
|                     image_sum_a, image_sqsum_a, | ||||
|                     result); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             // Stream 0 is handled synchronously: block until the device is done. | ||||
|             if (stream == 0) | ||||
|             { | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         ////////////////////////////////////////////////////////////////////// | ||||
|         // normalize | ||||
|  | ||||
|         // Normalization kernel, one thread per element (x, y) of result. | ||||
|         // | ||||
|         // Takes the four-corner difference of image_sqsum at rows y, y + h and at element | ||||
|         // offsets x * cn, (x + w) * cn, then replaces result(y, x) with | ||||
|         // normAcc(result(y, x), sqrtf(window_sqsum * templ_sqsum)). | ||||
|         // cn is the compile-time column stride applied to x (presumably the channel count | ||||
|         // of an interleaved table - confirm against the caller). | ||||
|         template <int cn> | ||||
|         __global__ void normalizeKernel_8U( | ||||
|                 int w, int h, const PtrStep<unsigned long long> image_sqsum, | ||||
|                 unsigned long long templ_sqsum, PtrStepSzf result) | ||||
|         { | ||||
|             const int x = threadIdx.x + blockIdx.x * blockDim.x; | ||||
|             const int y = threadIdx.y + blockIdx.y * blockDim.y; | ||||
|  | ||||
|             // Threads mapped outside the result have nothing to do. | ||||
|             if (x >= result.cols || y >= result.rows) | ||||
|                 return; | ||||
|  | ||||
|             const int near_col = x * cn; | ||||
|             const int far_col = (x + w) * cn; | ||||
|             const unsigned long long* const near_row = image_sqsum.ptr(y); | ||||
|             const unsigned long long* const far_row = image_sqsum.ptr(y + h); | ||||
|  | ||||
|             const float window_sqsum = (float)( | ||||
|                     (far_row[far_col] - near_row[far_col]) - | ||||
|                     (far_row[near_col] - near_row[near_col])); | ||||
|  | ||||
|             float* const dst = result.ptr(y) + x; | ||||
|             *dst = normAcc(*dst, sqrtf(window_sqsum * templ_sqsum)); | ||||
|         } | ||||
|  | ||||
|         // Host-side launcher for normalizeKernel_8U<cn>. | ||||
|         // | ||||
|         // Selects the kernel instantiation for cn in 1..4 and launches it with one thread per | ||||
|         // result element; any other cn launches nothing. The launch status is checked | ||||
|         // afterwards, and for stream 0 the call also waits for the device to finish. | ||||
|         void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, | ||||
|                           unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream) | ||||
|         { | ||||
|             const dim3 block(32, 8); | ||||
|             const dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y)); | ||||
|  | ||||
|             if (cn == 1) | ||||
|             { | ||||
|                 normalizeKernel_8U<1><<<grid, block, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result); | ||||
|             } | ||||
|             else if (cn == 2) | ||||
|             { | ||||
|                 normalizeKernel_8U<2><<<grid, block, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result); | ||||
|             } | ||||
|             else if (cn == 3) | ||||
|             { | ||||
|                 normalizeKernel_8U<3><<<grid, block, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result); | ||||
|             } | ||||
|             else if (cn == 4) | ||||
|             { | ||||
|                 normalizeKernel_8U<4><<<grid, block, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result); | ||||
|             } | ||||
|  | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             // Stream 0 is handled synchronously: block until the device is done. | ||||
|             if (stream == 0) | ||||
|             { | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         ////////////////////////////////////////////////////////////////////// | ||||
|         // extractFirstChannel | ||||
|  | ||||
|         // Copies the first component of each cn-component float element of image into result. | ||||
|         // | ||||
|         // One thread per element (x, y) of result; each row of image is reinterpreted as an | ||||
|         // array of TypeVec<float, cn>::vec_type and first() picks the value to store. | ||||
|         template <int cn> | ||||
|         __global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result) | ||||
|         { | ||||
|             typedef typename TypeVec<float, cn>::vec_type pixel_type; | ||||
|  | ||||
|             const int x = threadIdx.x + blockDim.x * blockIdx.x; | ||||
|             const int y = threadIdx.y + blockDim.y * blockIdx.y; | ||||
|  | ||||
|             // Threads mapped outside the result have nothing to do. | ||||
|             if (x >= result.cols || y >= result.rows) | ||||
|                 return; | ||||
|  | ||||
|             const pixel_type* const src_row = (const pixel_type*)image.ptr(y); | ||||
|             const pixel_type pixel = src_row[x]; | ||||
|             result.ptr(y)[x] = first(pixel); | ||||
|         } | ||||
|  | ||||
|         // Host-side launcher for the extractFirstChannel_32F<cn> kernel. | ||||
|         // | ||||
|         // Selects the kernel instantiation for cn in 1..4 and launches it with one thread per | ||||
|         // result element; any other cn launches nothing. The launch status is checked | ||||
|         // afterwards, and for stream 0 the call also waits for the device to finish. | ||||
|         void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream) | ||||
|         { | ||||
|             const dim3 block(32, 8); | ||||
|             const dim3 grid(divUp(result.cols, block.x), divUp(result.rows, block.y)); | ||||
|  | ||||
|             if (cn == 1) | ||||
|             { | ||||
|                 extractFirstChannel_32F<1><<<grid, block, 0, stream>>>(image, result); | ||||
|             } | ||||
|             else if (cn == 2) | ||||
|             { | ||||
|                 extractFirstChannel_32F<2><<<grid, block, 0, stream>>>(image, result); | ||||
|             } | ||||
|             else if (cn == 3) | ||||
|             { | ||||
|                 extractFirstChannel_32F<3><<<grid, block, 0, stream>>>(image, result); | ||||
|             } | ||||
|             else if (cn == 4) | ||||
|             { | ||||
|                 extractFirstChannel_32F<4><<<grid, block, 0, stream>>>(image, result); | ||||
|             } | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             // Stream 0 is handled synchronously: block until the device is done. | ||||
|             if (stream == 0) | ||||
|             { | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|             } | ||||
|         } | ||||
|     } //namespace match_template | ||||
| }}} // namespace cv { namespace gpu { namespace cudev | ||||
|  | ||||
|  | ||||
| #endif /* CUDA_DISABLER */ | ||||
							
								
								
									
										182
									
								
								modules/gpuimgproc/src/cuda/mean_shift.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										182
									
								
								modules/gpuimgproc/src/cuda/mean_shift.cu
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,182 @@ | ||||
| /*M/////////////////////////////////////////////////////////////////////////////////////// | ||||
| // | ||||
| //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. | ||||
| // | ||||
| //  By downloading, copying, installing or using the software you agree to this license. | ||||
| //  If you do not agree to this license, do not download, install, | ||||
| //  copy or use the software. | ||||
| // | ||||
| // | ||||
| //                           License Agreement | ||||
| //                For Open Source Computer Vision Library | ||||
| // | ||||
| // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. | ||||
| // Copyright (C) 2009, Willow Garage Inc., all rights reserved. | ||||
| // Third party copyrights are property of their respective owners. | ||||
| // | ||||
| // Redistribution and use in source and binary forms, with or without modification, | ||||
| // are permitted provided that the following conditions are met: | ||||
| // | ||||
| //   * Redistribution's of source code must retain the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer. | ||||
| // | ||||
| //   * Redistribution's in binary form must reproduce the above copyright notice, | ||||
| //     this list of conditions and the following disclaimer in the documentation | ||||
| //     and/or other materials provided with the distribution. | ||||
| // | ||||
| //   * The name of the copyright holders may not be used to endorse or promote products | ||||
| //     derived from this software without specific prior written permission. | ||||
| // | ||||
| // This software is provided by the copyright holders and contributors "as is" and | ||||
| // any express or implied warranties, including, but not limited to, the implied | ||||
| // warranties of merchantability and fitness for a particular purpose are disclaimed. | ||||
| // In no event shall the Intel Corporation or contributors be liable for any direct, | ||||
| // indirect, incidental, special, exemplary, or consequential damages | ||||
| // (including, but not limited to, procurement of substitute goods or services; | ||||
| // loss of use, data, or profits; or business interruption) however caused | ||||
| // and on any theory of liability, whether in contract, strict liability, | ||||
| // or tort (including negligence or otherwise) arising in any way out of | ||||
| // the use of this software, even if advised of the possibility of such damage. | ||||
| // | ||||
| //M*/ | ||||
|  | ||||
| #if !defined CUDA_DISABLER | ||||
|  | ||||
| #include "opencv2/core/cuda/common.hpp" | ||||
| #include "opencv2/core/cuda/vec_traits.hpp" | ||||
| #include "opencv2/core/cuda/vec_math.hpp" | ||||
| #include "opencv2/core/cuda/saturate_cast.hpp" | ||||
| #include "opencv2/core/cuda/border_interpolate.hpp" | ||||
|  | ||||
| namespace cv { namespace gpu { namespace cudev | ||||
| { | ||||
|     namespace imgproc | ||||
|     { | ||||
|         // 2D texture reference with uchar4 texels. The host launchers in this file bind it to the | ||||
|         // source image before each kernel launch, and do_mean_shift samples it with tex2D. | ||||
|         texture<uchar4, 2> tex_meanshift; | ||||
|  | ||||
|         // Runs the mean-shift iteration for one pixel and stores the resulting color. | ||||
|         // | ||||
|         // Starting from (x0, y0) and the texel of tex_meanshift at that position, each round | ||||
|         // scans the window [x0 - sp, x0 + sp] x [y0 - sp, y0 + sp], keeps the texels whose | ||||
|         // squared x/y/z color distance to the current color is at most sr * sr, and moves both | ||||
|         // the position and the color to the mean of those texels (truncated toward zero). | ||||
|         // The loop ends after maxIter rounds, when no texel qualified, when the position did | ||||
|         // not move, or when |dx| + |dy| + squared color change <= eps. | ||||
|         // | ||||
|         // The final color is written to out at the calling thread's own pixel (derived from | ||||
|         // blockIdx / threadIdx, 4 bytes per pixel, rows out_step bytes apart); its w component | ||||
|         // is the one read from the starting texel. Returns the final position as short2. | ||||
|         // cols and rows are accepted but not used in the body. | ||||
|         __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out, | ||||
|                                         size_t out_step, int cols, int rows, | ||||
|                                         int sp, int sr, int maxIter, float eps) | ||||
|         { | ||||
|             const int range_sq = sr * sr; | ||||
|             uchar4 center = tex2D(tex_meanshift, x0, y0); | ||||
|  | ||||
|             for (int iter = 0; iter < maxIter; ++iter) | ||||
|             { | ||||
|                 // Window bounds for this round, centered on the current position. | ||||
|                 const int x_first = x0 - sp; | ||||
|                 const int x_last = x0 + sp; | ||||
|                 const int y_first = y0 - sp; | ||||
|                 const int y_last = y0 + sp; | ||||
|  | ||||
|                 int hits = 0; | ||||
|                 int acc_c0 = 0, acc_c1 = 0, acc_c2 = 0; | ||||
|                 int acc_x = 0, acc_y = 0; | ||||
|  | ||||
|                 for (int y = y_first; y <= y_last; ++y) | ||||
|                 { | ||||
|                     int row_hits = 0; | ||||
|                     for (int x = x_first; x <= x_last; ++x) | ||||
|                     { | ||||
|                         const uchar4 texel = tex2D(tex_meanshift, x, y); | ||||
|  | ||||
|                         const int dist_sq = (texel.x - center.x) * (texel.x - center.x) | ||||
|                                           + (texel.y - center.y) * (texel.y - center.y) | ||||
|                                           + (texel.z - center.z) * (texel.z - center.z); | ||||
|  | ||||
|                         // Skip texels whose color is outside the range radius. | ||||
|                         if (dist_sq > range_sq) | ||||
|                             continue; | ||||
|  | ||||
|                         acc_c0 += texel.x; | ||||
|                         acc_c1 += texel.y; | ||||
|                         acc_c2 += texel.z; | ||||
|                         acc_x += x; | ||||
|                         ++row_hits; | ||||
|                     } | ||||
|                     hits += row_hits; | ||||
|                     // Every hit in this row shares the same y, so add it once per hit. | ||||
|                     acc_y += y * row_hits; | ||||
|                 } | ||||
|  | ||||
|                 if (hits == 0) | ||||
|                     break; | ||||
|  | ||||
|                 const float inv_hits = 1.f / hits; | ||||
|                 const int x1 = __float2int_rz(acc_x * inv_hits); | ||||
|                 const int y1 = __float2int_rz(acc_y * inv_hits); | ||||
|                 const int mean_c0 = __float2int_rz(acc_c0 * inv_hits); | ||||
|                 const int mean_c1 = __float2int_rz(acc_c1 * inv_hits); | ||||
|                 const int mean_c2 = __float2int_rz(acc_c2 * inv_hits); | ||||
|  | ||||
|                 // Color change is measured against the color from before this round's update. | ||||
|                 const int color_shift_sq = (mean_c0 - center.x) * (mean_c0 - center.x) | ||||
|                                          + (mean_c1 - center.y) * (mean_c1 - center.y) | ||||
|                                          + (mean_c2 - center.z) * (mean_c2 - center.z); | ||||
|  | ||||
|                 const bool converged = (x0 == x1 && y0 == y1) | ||||
|                                     || (::abs(x1 - x0) + ::abs(y1 - y0) + color_shift_sq <= eps); | ||||
|  | ||||
|                 // Commit the new state even on the final round. | ||||
|                 x0 = x1; | ||||
|                 y0 = y1; | ||||
|                 center.x = mean_c0; | ||||
|                 center.y = mean_c1; | ||||
|                 center.z = mean_c2; | ||||
|  | ||||
|                 if (converged) | ||||
|                     break; | ||||
|             } | ||||
|  | ||||
|             // Byte offset of the calling thread's pixel inside out. | ||||
|             const unsigned int px = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|             const unsigned int py = blockIdx.y * blockDim.y + threadIdx.y; | ||||
|             const int offset = py * out_step + px * 4 * sizeof(uchar); | ||||
|             *(uchar4*)(out + offset) = center; | ||||
|  | ||||
|             return make_short2((short)x0, (short)y0); | ||||
|         } | ||||
|  | ||||
|         // Kernel wrapper: one thread per pixel; threads inside the cols x rows area run | ||||
|         // do_mean_shift for their pixel and discard the returned position. | ||||
|         __global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps ) | ||||
|         { | ||||
|             const int px = threadIdx.x + blockIdx.x * blockDim.x; | ||||
|             const int py = threadIdx.y + blockIdx.y * blockDim.y; | ||||
|  | ||||
|             // Threads mapped outside the image have nothing to do. | ||||
|             if (px >= cols || py >= rows) | ||||
|                 return; | ||||
|  | ||||
|             do_mean_shift(px, py, out, out_step, cols, rows, sp, sr, maxIter, eps); | ||||
|         } | ||||
|  | ||||
|         // Host-side entry point for mean-shift filtering. | ||||
|         // | ||||
|         // Binds src to tex_meanshift as a 2D uchar4 texture, launches meanshift_kernel with | ||||
|         // 32x8 blocks covering src, and checks the launch status. For stream 0 the call also | ||||
|         // waits for the device to finish. The texture is not unbound here. | ||||
|         // NOTE(review): the grid is sized from src while the kernel is bounded by dst.cols / | ||||
|         // dst.rows, so src and dst are presumably the same size - confirm against the caller. | ||||
|         void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream) | ||||
|         { | ||||
|             const dim3 block(32, 8, 1); | ||||
|             const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y), 1); | ||||
|  | ||||
|             const cudaChannelFormatDesc texel_desc = cudaCreateChannelDesc<uchar4>(); | ||||
|             cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, texel_desc, src.cols, src.rows, src.step ) ); | ||||
|  | ||||
|             meanshift_kernel<<< grid, block, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps ); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             // Stream 0 is handled synchronously: block until the device is done. | ||||
|             if (stream == 0) | ||||
|             { | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // Kernel wrapper that keeps both outputs of do_mean_shift: the color goes to outr | ||||
|         // (written inside do_mean_shift) and the returned short2 position goes to outsp at the | ||||
|         // thread's own pixel (2 shorts per pixel, rows outspstep bytes apart). | ||||
|         __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep, | ||||
|                                              unsigned char* outsp, size_t outspstep, | ||||
|                                              int cols, int rows, | ||||
|                                              int sp, int sr, int maxIter, float eps) | ||||
|         { | ||||
|             const int px = threadIdx.x + blockIdx.x * blockDim.x; | ||||
|             const int py = threadIdx.y + blockIdx.y * blockDim.y; | ||||
|  | ||||
|             // Threads mapped outside the image have nothing to do. | ||||
|             if (px >= cols || py >= rows) | ||||
|                 return; | ||||
|  | ||||
|             // Byte offset of this thread's entry inside outsp. | ||||
|             const int sp_offset = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short); | ||||
|  | ||||
|             const short2 final_pos = do_mean_shift(px, py, outr, outrstep, cols, rows, sp, sr, maxIter, eps); | ||||
|             *(short2*)(outsp + sp_offset) = final_pos; | ||||
|         } | ||||
|  | ||||
|         // Host-side entry point for mean-shift with both color and position outputs. | ||||
|         // | ||||
|         // Binds src to tex_meanshift as a 2D uchar4 texture, launches meanshiftproc_kernel with | ||||
|         // 32x8 blocks covering src, and checks the launch status. dstr receives the colors and | ||||
|         // dstsp the positions. For stream 0 the call also waits for the device to finish. | ||||
|         // The texture is not unbound here. | ||||
|         // NOTE(review): the grid is sized from src while the kernel is bounded by dstr.cols / | ||||
|         // dstr.rows, so src, dstr and dstsp are presumably the same size - confirm. | ||||
|         void meanShiftProc_gpu(const PtrStepSzb& src, PtrStepSzb dstr, PtrStepSzb dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream) | ||||
|         { | ||||
|             const dim3 block(32, 8, 1); | ||||
|             const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y), 1); | ||||
|  | ||||
|             const cudaChannelFormatDesc texel_desc = cudaCreateChannelDesc<uchar4>(); | ||||
|             cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, texel_desc, src.cols, src.rows, src.step ) ); | ||||
|  | ||||
|             meanshiftproc_kernel<<< grid, block, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps ); | ||||
|             cudaSafeCall( cudaGetLastError() ); | ||||
|  | ||||
|             // Stream 0 is handled synchronously: block until the device is done. | ||||
|             if (stream == 0) | ||||
|             { | ||||
|                 cudaSafeCall( cudaDeviceSynchronize() ); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| }}} | ||||
|  | ||||
| #endif | ||||
		Reference in New Issue
	
	Block a user
	 Vladislav Vinogradov
					Vladislav Vinogradov