moved GpuMat's operations implementation to core module
@@ -49,36 +49,6 @@
#include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp"

#ifndef CV_PI
#define CV_PI 3.1415926535897932384626433832795
#endif

#ifndef CV_PI_F
#ifndef CV_PI
#define CV_PI_F 3.14159265f
#else
#define CV_PI_F ((float)CV_PI)
#endif
#endif

#ifdef __CUDACC__

namespace cv { namespace gpu { namespace device
{
    typedef unsigned char uchar;
    typedef unsigned short ushort;
    typedef signed char schar;
    typedef unsigned int uint;

    template<class T> static inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)
    {
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
        cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
    }
}}}

#endif

namespace cv { namespace gpu
{
    enum
@@ -94,8 +64,6 @@ namespace cv { namespace gpu
    // Returns true if the GPU analogue exists, false otherwise.
    bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);

    static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

    class NppStreamHandler
    {
    public:
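For context, divUp rounds a length up to a whole number of blocks; the kernels touched by this commit use it to size their 2D launch grids. A minimal sketch of that pattern follows (the kernel name and block shape are illustrative, not taken from this commit):

#include <cuda_runtime.h>

static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

// Illustrative kernel: one thread per pixel of a pitched 8-bit image.
__global__ void touchEveryPixel(unsigned char* data, int cols, int rows, size_t step)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < cols && y < rows)
        data[y * step + x] = 0;   // step is a byte pitch; elements are 1 byte here
}

void launchTouchEveryPixel(unsigned char* data, int cols, int rows, size_t step)
{
    dim3 block(16, 16);
    dim3 grid(divUp(cols, block.x), divUp(rows, block.y)); // round up so partial tiles are covered
    touchEveryPixel<<<grid, block>>>(data, cols, rows, step);
}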
@@ -1,347 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"

namespace cv { namespace gpu { namespace device
{
    template <typename T> struct shift_and_sizeof;
    template <> struct shift_and_sizeof<signed char>    { enum { shift = 0 }; };
    template <> struct shift_and_sizeof<unsigned char>  { enum { shift = 0 }; };
    template <> struct shift_and_sizeof<short>          { enum { shift = 1 }; };
    template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
    template <> struct shift_and_sizeof<int>            { enum { shift = 2 }; };
    template <> struct shift_and_sizeof<float>          { enum { shift = 2 }; };
    template <> struct shift_and_sizeof<double>         { enum { shift = 3 }; };

    ///////////////////////////////////////////////////////////////////////////
    ////////////////////////////////// CopyTo /////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////

    template<typename T>
    __global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;

        if ((x < cols * channels ) && (y < rows))
            if (mask[y * step_mask + x / channels] != 0)
            {
                size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;
                mat_dst[idx] = mat_src[idx];
            }
    }

    template<typename T>
    void copy_to_with_mask_run(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
    {
        dim3 threadsPerBlock(16, 16, 1);
        dim3 numBlocks( divUp(mat_src.cols * channels, threadsPerBlock.x), divUp(mat_src.rows, threadsPerBlock.y), 1);

        copy_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>
            ((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void copy_to_with_mask(const DevMem2Db& mat_src, DevMem2Db mat_dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
    {
        typedef void (*CopyToFunc)(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream);

        static CopyToFunc tab[8] =
        {
            copy_to_with_mask_run<unsigned char>,
            copy_to_with_mask_run<signed char>,
            copy_to_with_mask_run<unsigned short>,
            copy_to_with_mask_run<short>,
            copy_to_with_mask_run<int>,
            copy_to_with_mask_run<float>,
            copy_to_with_mask_run<double>,
            0
        };

        CopyToFunc func = tab[depth];

        if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);

        func(mat_src, mat_dst, mask, channels, stream);
    }

    ///////////////////////////////////////////////////////////////////////////
    ////////////////////////////////// SetTo //////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////

    __constant__ uchar scalar_8u[4];
    __constant__ schar scalar_8s[4];
    __constant__ ushort scalar_16u[4];
    __constant__ short scalar_16s[4];
    __constant__ int scalar_32s[4];
    __constant__ float scalar_32f[4];
    __constant__ double scalar_64f[4];

    template <typename T> __device__ __forceinline__ T readScalar(int i);
    template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
    template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}
    template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
    template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}
    template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}
    template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}
    template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}

    void writeScalar(const uchar* vals)
    {
        cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );
    }
    void writeScalar(const schar* vals)
    {
        cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );
    }
    void writeScalar(const ushort* vals)
    {
        cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );
    }
    void writeScalar(const short* vals)
    {
        cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );
    }
    void writeScalar(const int* vals)
    {
        cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );
    }
    void writeScalar(const float* vals)
    {
        cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );
    }
    void writeScalar(const double* vals)
    {
        cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );
    }

    template<typename T>
    __global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;

        if ((x < cols * channels ) && (y < rows))
        {
            size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
            mat[idx] = readScalar<T>(x % channels);
        }
    }

    template<typename T>
    __global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;

        if ((x < cols * channels ) && (y < rows))
            if (mask[y * step_mask + x / channels] != 0)
            {
                size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
                mat[idx] = readScalar<T>(x % channels);
            }
    }

    template <typename T>
    void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream)
    {
        writeScalar(scalar);

        dim3 threadsPerBlock(32, 8, 1);
        dim3 numBlocks(mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);

        set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
    template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
    template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
    template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
    template void set_to_gpu<int   >(const DevMem2Db& mat, const int* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
    template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
    template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);

    template <typename T>
    void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream)
    {
        writeScalar(scalar);

        dim3 threadsPerBlock(32, 8, 1);
        dim3 numBlocks(mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);

        set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, int channels, cudaStream_t stream);
    template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, int channels, cudaStream_t stream);
    template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, int channels, cudaStream_t stream);
    template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, int channels, cudaStream_t stream);
    template void set_to_gpu<int   >(const DevMem2Db& mat, const int* scalar, int channels, cudaStream_t stream);
    template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, int channels, cudaStream_t stream);
    template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, int channels, cudaStream_t stream);

    ///////////////////////////////////////////////////////////////////////////
    //////////////////////////////// ConvertTo ////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////

    template <typename T, typename D> struct Convertor : unary_function<T, D>
    {
        Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}

        __device__ __forceinline__ D operator()(const T& src) const
        {
            return saturate_cast<D>(alpha * src + beta);
        }

        const double alpha, beta;
    };

    namespace detail
    {
        template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
        {
        };
        template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
        {
            enum { smart_shift = 8 };
        };
        template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
        {
            enum { smart_shift = 4 };
        };
        template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
        {
            enum { smart_block_dim_y = 8 };
            enum { smart_shift = 4 };
        };

        template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
        {
            enum { smart_shift = 4 };
        };
        template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
        {
            enum { smart_shift = 2 };
        };

        template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
        {
            enum { smart_block_dim_y = 8 };
            enum { smart_shift = 4 };
        };
        template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
        {
            enum { smart_block_dim_y = 8 };
            enum { smart_shift = 2 };
        };

        template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
        {
        };
    }

    template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
    {
    };

    template<typename T, typename D>
    void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream)
    {
        cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
        cudaSafeCall( cudaSetDoubleForDevice(&beta) );
        Convertor<T, D> op(alpha, beta);
        ::cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);
    }

    void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta,
                     cudaStream_t stream = 0)
    {
        typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta,
                                 cudaStream_t stream);

        static const caller_t tab[8][8] =
        {
            {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
             cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},

            {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
             cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},

            {cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
             cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},

            {cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
             cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},

            {cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
             cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},

            {cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
             cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},

            {cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
             cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},

            {0,0,0,0,0,0,0,0}
        };

        caller_t func = tab[sdepth][ddepth];
        if (!func)
            cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);

        func(src, dst, alpha, beta, stream);
    }
}}} // namespace cv { namespace gpu { namespace device
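The kernels above index a row through step >> shift_and_sizeof<T>::shift, which converts the byte pitch of a padded row into a pitch in elements. A small standalone sketch of that equivalence (the pitch value is assumed for illustration):

#include <cassert>
#include <cstddef>

template <typename T> struct shift_and_sizeof;
template <> struct shift_and_sizeof<float> { enum { shift = 2 }; }; // sizeof(float) == 4 == 1 << 2

int main()
{
    const size_t step = 1024;                                   // assumed byte pitch of one row
    const size_t elems = step >> shift_and_sizeof<float>::shift;
    assert(elems == step / sizeof(float));                      // 256 float elements per row
    // idx = y * elems + x is then a valid element index into the pitched buffer
    return 0;
}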
@@ -69,36 +69,36 @@ namespace cv { namespace gpu
    void ncvError(int err, const char *file, const int line, const char *func = "");
    void cufftError(int err, const char *file, const int line, const char *func = "");
    void cublasError(int err, const char *file, const int line, const char *func = "");

    static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
    {
        if (cudaSuccess != err)
            cv::gpu::error(cudaGetErrorString(err), file, line, func);
    }

    static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
    {
        if (err < 0)
            cv::gpu::nppError(err, file, line, func);
    }

    static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
    {
        if (NCV_SUCCESS != err)
            cv::gpu::ncvError(err, file, line, func);
    }

    static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
    {
        if (CUFFT_SUCCESS != err)
            cv::gpu::cufftError(err, file, line, func);
    }

    static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
    {
        if (CUBLAS_STATUS_SUCCESS != err)
            cv::gpu::cublasError(err, file, line, func);
    }
}}

static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
    if (cudaSuccess != err)
        cv::gpu::error(cudaGetErrorString(err), file, line, func);
}

static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
    if (err < 0)
        cv::gpu::nppError(err, file, line, func);
}

static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
{
    if (NCV_SUCCESS != err)
        cv::gpu::ncvError(err, file, line, func);
}

static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
{
    if (CUFFT_SUCCESS != err)
        cv::gpu::cufftError(err, file, line, func);
}

static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
{
    if (CUBLAS_STATUS_SUCCESS != err)
        cv::gpu::cublasError(err, file, line, func);
}

#endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */
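These helpers are reached through wrapper macros (cudaSafeCall and friends) that capture the call site. A minimal sketch of that check-and-report pattern in isolation; the error handler below is a stand-in, where the real code calls cv::gpu::error:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

static void reportError(const char* msg, const char* file, int line, const char* func)
{
    std::fprintf(stderr, "GPU error: %s in %s, %s:%d\n", msg, func, file, line);
    std::exit(EXIT_FAILURE);
}

static inline void checkCuda(cudaError_t err, const char* file, int line, const char* func)
{
    if (err != cudaSuccess)
        reportError(cudaGetErrorString(err), file, line, func);
}

#define CUDA_CHECK(expr) checkCuda((expr), __FILE__, __LINE__, __func__)

int main()
{
    void* p = 0;
    CUDA_CHECK(cudaMalloc(&p, 1024)); // any failure is reported with file/line/function
    CUDA_CHECK(cudaFree(p));
    return 0;
}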
@@ -71,19 +71,13 @@ cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }

#include "opencv2/gpu/stream_accessor.hpp"

namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu
{
    void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream = 0);

    template <typename T>
    void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
    template <typename T>
    void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);

    void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
}}}

using namespace ::cv::gpu::device;
    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream);
    void setTo(GpuMat& src, Scalar s, cudaStream_t stream);
    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
}}

struct Stream::Impl
{
@@ -99,20 +93,6 @@ namespace
        size_t bwidth = src.cols * src.elemSize();
        cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, k, s) );
    };

    template <typename T>
    void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream)
    {
        Scalar_<T> sf = s;
        set_to_gpu(src, sf.val, src.channels(), stream);
    }

    template <typename T>
    void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream)
    {
        Scalar_<T> sf = s;
        set_to_gpu(src, sf.val, mask, src.channels(), stream);
    }
}

CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl ? stream.impl->stream : 0; };
@@ -208,13 +188,7 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar s)
        }
    }

    typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream);
    static const set_caller_t set_callers[] =
    {
        kernelSet<uchar>, kernelSet<schar>, kernelSet<ushort>, kernelSet<short>,
        kernelSet<int>, kernelSet<float>, kernelSet<double>
    };
    set_callers[src.depth()](src, s, impl->stream);
    setTo(src, s, impl->stream);
}

void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
@@ -224,13 +198,7 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)

    CV_Assert(mask.type() == CV_8UC1);

    typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream);
    static const set_caller_t set_callers[] =
    {
        kernelSetMask<uchar>, kernelSetMask<schar>, kernelSetMask<ushort>, kernelSetMask<short>,
        kernelSetMask<int>, kernelSetMask<float>, kernelSetMask<double>
    };
    set_callers[src.depth()](src, val, mask, impl->stream);
    setTo(src, val, mask, impl->stream);
}

void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
@@ -258,7 +226,7 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype,
        psrc = &(temp = src);

    dst.create( src.size(), rtype );
    convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
    convertTo(src, dst, alpha, beta, impl->stream);
}

cv::gpu::Stream::operator bool() const
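After this change the Stream enqueue methods forward to the shared setTo/convertTo entry points instead of dispatching on depth locally. A usage sketch of how those paths are reached from the public OpenCV 2.x gpu API (assumes a CUDA-enabled OpenCV build):

#include <opencv2/gpu/gpu.hpp>

void exampleStreamUsage()
{
    cv::gpu::Stream stream;
    cv::gpu::GpuMat img(480, 640, CV_8UC3);
    cv::gpu::GpuMat img32f;

    stream.enqueueMemSet(img, cv::Scalar(0, 255, 0));           // routed to setTo(...)
    stream.enqueueConvert(img, img32f, CV_32FC3, 1.0 / 255.0);  // routed to convertTo(...)
    stream.waitForCompletion();
}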
@@ -220,22 +220,6 @@ namespace cv
{
    namespace gpu
    {
        void error(const char *error_string, const char *file, const int line, const char *func)
        {
            int code = CV_GpuApiCallError;

            if (uncaught_exception())
            {
                const char* errorStr = cvErrorStr(code);
                const char* function = func ? func : "unknown function";

                cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line;
                cerr.flush();
            }
            else
                cv::error( cv::Exception(code, error_string, func, file, line) );
        }

        void nppError(int code, const char *file, const int line, const char *func)
        {
            string msg = getErrorString(code, npp_errors, npp_error_num);
@@ -271,379 +271,5 @@ void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory)
    setDevice(prev_device_id);
}

////////////////////////////////////////////////////////////////////
// GpuFuncTable

namespace cv { namespace gpu { namespace device
{
    void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t& stream = 0);

    template <typename T>
    void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
    template <typename T>
    void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);

    void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
}}}

namespace
{
    //////////////////////////////////////////////////////////////////////////
    // Convert

    template<int n> struct NPPTypeTraits;
    template<> struct NPPTypeTraits<CV_8U>  { typedef Npp8u npp_type; };
    template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
    template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
    template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
    template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };

    template<int SDEPTH, int DDEPTH> struct NppConvertFunc
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
    };
    template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
    {
        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
    };

    template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

        static void cvt(const GpuMat& src, GpuMat& dst)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;
            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
    template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
    {
        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

        static void cvt(const GpuMat& src, GpuMat& dst)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;
            nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    void convertToKernelCaller(const GpuMat& src, GpuMat& dst)
    {
        ::cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
    }

    //////////////////////////////////////////////////////////////////////////
    // Set

    template<int SDEPTH, int SCN> struct NppSetFunc
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
    };
    template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
    };

    template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        static void set(GpuMat& src, Scalar s)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            Scalar_<src_t> nppS = s;

            nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
    template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        static void set(GpuMat& src, Scalar s)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            Scalar_<src_t> nppS = s;

            nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    template <typename T>
    void kernelSet(GpuMat& src, Scalar s)
    {
        Scalar_<T> sf = s;
        ::cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), 0);
    }

    template<int SDEPTH, int SCN> struct NppSetMaskFunc
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
    };
    template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
    };

    template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        static void set(GpuMat& src, Scalar s, const GpuMat& mask)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            Scalar_<src_t> nppS = s;

            nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
    template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        static void set(GpuMat& src, Scalar s, const GpuMat& mask)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            Scalar_<src_t> nppS = s;

            nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    template <typename T>
    void kernelSetMask(GpuMat& src, Scalar s, const GpuMat& mask)
    {
        Scalar_<T> sf = s;
        ::cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), 0);
    }

    class CudaFuncTable : public GpuFuncTable
    {
    public:
        void copy(const Mat& src, GpuMat& dst) const
        {
            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );
        }
        void copy(const GpuMat& src, Mat& dst) const
        {
            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) );
        }
        void copy(const GpuMat& src, GpuMat& dst) const
        {
            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) );
        }

        void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
        {
            ::cv::gpu::device::copy_to_with_mask(src, dst, src.depth(), mask, src.channels());
        }

        void convert(const GpuMat& src, GpuMat& dst) const
        {
            typedef void (*caller_t)(const GpuMat& src, GpuMat& dst);
            static const caller_t callers[7][7][7] =
            {
                {
                    /* 8U -> 8U */ {0, 0, 0, 0},
                    /* 8U -> 8S */ {convertToKernelCaller, convertToKernelCaller, convertToKernelCaller, convertToKernelCaller},
                    /* 8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::cvt},
                    /* 8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::cvt},
                    /* 8U -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 8U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
                },
                {
                    /* 8S -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 8S -> 8S */ {0,0,0,0},
                    /* 8S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 8S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 8S -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 8S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 8S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
                },
                {
                    /* 16U -> 8U */ {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::cvt},
                    /* 16U -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16U -> 16U */ {0,0,0,0},
                    /* 16U -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
                },
                {
                    /* 16S -> 8U */ {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::cvt},
                    /* 16S -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16S -> 16S */ {0,0,0,0},
                    /* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
                },
                {
                    /* 32S -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32S -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32S -> 32S */ {0,0,0,0},
                    /* 32S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
                },
                {
                    /* 32F -> 8U */ {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32F -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32F -> 32F */ {0,0,0,0},
                    /* 32F -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
                },
                {
                    /* 64F -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 64F -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 64F -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 64F -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 64F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 64F -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 64F -> 64F */ {0,0,0,0}
                }
            };

            caller_t func = callers[src.depth()][dst.depth()][src.channels() - 1];
            CV_DbgAssert(func != 0);

            func(src, dst);
        }

        void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
        {
            ::cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta);
        }

        void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
        {
            NppiSize sz;
            sz.width = m.cols;
            sz.height = m.rows;

            if (mask.empty())
            {
                if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
                {
                    cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
                    return;
                }

                if (m.depth() == CV_8U)
                {
                    int cn = m.channels();

                    if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
                    {
                        int val = saturate_cast<uchar>(s[0]);
                        cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
                        return;
                    }
                }

                typedef void (*caller_t)(GpuMat& src, Scalar s);
                static const caller_t callers[7][4] =
                {
                    {NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet<uchar>,kernelSet<uchar>,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
                    {kernelSet<schar>,kernelSet<schar>,kernelSet<schar>,kernelSet<schar>},
                    {NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,NppSet<CV_16U, 2, nppiSet_16u_C2R>::set,kernelSet<ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
                    {NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,NppSet<CV_16S, 2, nppiSet_16s_C2R>::set,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
                    {NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet<int>,kernelSet<int>,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
                    {NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet<float>,kernelSet<float>,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
                    {kernelSet<double>,kernelSet<double>,kernelSet<double>,kernelSet<double>}
                };

                callers[m.depth()][m.channels() - 1](m, s);
            }
            else
            {
                typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask);

                static const caller_t callers[7][4] =
                {
                    {NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask<uchar>,kernelSetMask<uchar>,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
                    {kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>},
                    {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask<ushort>,kernelSetMask<ushort>,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
                    {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask<short>,kernelSetMask<short>,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
                    {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask<int>,kernelSetMask<int>,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
                    {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask<float>,kernelSetMask<float>,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
                    {kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>}
                };

                callers[m.depth()][m.channels() - 1](m, s, mask);
            }
        }

        void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
        {
            cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
        }

        void free(void* devPtr) const
        {
            cudaFree(devPtr);
        }
    };

    class Initializer
    {
    public:
        Initializer()
        {
            static CudaFuncTable funcTable;
            setGpuFuncTable(&funcTable);
        }
    };

    Initializer init;
}

#endif

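The block removed above registered a CudaFuncTable through a static Initializer, so the depth/channel dispatch sits behind a single table that generic code calls into. A minimal, self-contained sketch of that registration pattern; the interface and names are simplified stand-ins, not OpenCV's actual declarations:

#include <cstdio>

struct FuncTable
{
    virtual ~FuncTable() {}
    virtual void setTo(int value) const = 0;
};

static const FuncTable* g_table = 0;
void setFuncTable(const FuncTable* table) { g_table = table; }

struct CudaTable : FuncTable
{
    void setTo(int value) const { std::printf("set on GPU to %d\n", value); }
};

namespace
{
    struct Initializer
    {
        Initializer()
        {
            static CudaTable table;   // lives for the whole program
            setFuncTable(&table);     // the generic side only sees the interface
        }
    };

    Initializer init;                 // runs during static initialization, as in the diff
}

int main()
{
    if (g_table) g_table->setTo(7);
    return 0;
}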
@@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
#define __OPENCV_GPU_BORDER_INTERPOLATE_HPP__

#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
#include "vec_math.hpp"

@@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_COLOR_HPP__
#define __OPENCV_GPU_COLOR_HPP__

#include "internal_shared.hpp"
#include "detail/color_detail.hpp"

namespace cv { namespace gpu { namespace device

100  modules/gpu/src/opencv2/gpu/device/common.hpp  Normal file
@@ -0,0 +1,100 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_GPU_COMMON_HPP__
#define __OPENCV_GPU_COMMON_HPP__

#include <cuda_runtime.h>
#include "opencv2/core/devmem2d.hpp"

#ifndef CV_PI
#define CV_PI 3.1415926535897932384626433832795
#endif

#ifndef CV_PI_F
#ifndef CV_PI
#define CV_PI_F 3.14159265f
#else
#define CV_PI_F ((float)CV_PI)
#endif
#endif

namespace cv { namespace gpu
{
    __host__ __device__ __forceinline__ int divUp(int total, int grain)
    {
        return (total + grain - 1) / grain;
    }

    namespace device
    {
        typedef unsigned char uchar;
        typedef unsigned short ushort;
        typedef signed char schar;
        typedef unsigned int uint;

        template<class T> inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)
        {
            cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
            cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
        }
    }
}}

#if defined(__GNUC__)
    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
#endif

namespace cv { namespace gpu
{
    void error(const char *error_string, const char *file, const int line, const char *func = "");
}}

static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
    if (cudaSuccess != err)
        cv::gpu::error(cudaGetErrorString(err), file, line, func);
}

#endif // __OPENCV_GPU_COMMON_HPP__
@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_DATAMOV_UTILS_HPP__
#define __OPENCV_GPU_DATAMOV_UTILS_HPP__

#include "internal_shared.hpp"
#include "common.hpp"

namespace cv { namespace gpu { namespace device
{

@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_COLOR_DETAIL_HPP__
#define __OPENCV_GPU_COLOR_DETAIL_HPP__

#include "internal_shared.hpp"
#include "../common.hpp"
#include "../vec_traits.hpp"
#include "../saturate_cast.hpp"
#include "../limits.hpp"

@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
#define __OPENCV_GPU_TRANSFORM_DETAIL_HPP__

#include "internal_shared.hpp"
#include "../common.hpp"
#include "../vec_traits.hpp"
#include "../functional.hpp"

@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
#define __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__

#include "internal_shared.hpp"
#include "../common.hpp"
#include "../vec_traits.hpp"

namespace cv { namespace gpu { namespace device

@@ -43,8 +43,6 @@
#ifndef __OPENCV_GPU_UTILITY_DETAIL_HPP__
#define __OPENCV_GPU_UTILITY_DETAIL_HPP__

#include "internal_shared.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace utility_detail

@@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__

#include "internal_shared.hpp"
#include "../datamov_utils.hpp"

namespace cv { namespace gpu { namespace device

@@ -43,41 +43,38 @@
#ifndef __OPENCV_GPU_DYNAMIC_SMEM_HPP__
#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__

#include "internal_shared.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

template<class T> struct DynamicSharedMem
{
    __device__ __forceinline__ operator T*()
namespace cv { namespace gpu { namespace device
{
    template<class T> struct DynamicSharedMem
    {
        extern __shared__ int __smem[];
        return (T*)__smem;
    }
        __device__ __forceinline__ operator T*()
        {
            extern __shared__ int __smem[];
            return (T*)__smem;
        }

    __device__ __forceinline__ operator const T*() const
        __device__ __forceinline__ operator const T*() const
        {
            extern __shared__ int __smem[];
            return (T*)__smem;
        }
    };

    // specialize for double to avoid unaligned memory access compile errors
    template<> struct DynamicSharedMem<double>
    {
        extern __shared__ int __smem[];
        return (T*)__smem;
    }
};
        __device__ __forceinline__ operator double*()
        {
            extern __shared__ double __smem_d[];
            return (double*)__smem_d;
        }

// specialize for double to avoid unaligned memory access compile errors
template<> struct DynamicSharedMem<double>
{
    __device__ __forceinline__ operator double*()
    {
        extern __shared__ double __smem_d[];
        return (double*)__smem_d;
    }

    __device__ __forceinline__ operator const double*() const
    {
        extern __shared__ double __smem_d[];
        return (double*)__smem_d;
    }
};

END_OPENCV_DEVICE_NAMESPACE
        __device__ __forceinline__ operator const double*() const
        {
            extern __shared__ double __smem_d[];
            return (double*)__smem_d;
        }
    };
}}}

#endif // __OPENCV_GPU_DYNAMIC_SMEM_HPP__

@@ -43,7 +43,6 @@
#ifndef OPENCV_GPU_EMULATION_HPP_
#define OPENCV_GPU_EMULATION_HPP_

#include "internal_shared.hpp"
#include "warp_reduce.hpp"

namespace cv { namespace gpu { namespace device

@@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_FILTERS_HPP__
#define __OPENCV_GPU_FILTERS_HPP__

#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
#include "vec_math.hpp"

@@ -45,7 +45,6 @@
#define __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_

#include <cstdio>
#include "internal_shared.hpp"

namespace cv { namespace gpu { namespace device
{

@@ -44,7 +44,6 @@
#define __OPENCV_GPU_FUNCTIONAL_HPP__

#include <thrust/functional.h>
#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
#include "type_traits.hpp"

@@ -44,7 +44,7 @@
#define __OPENCV_GPU_LIMITS_GPU_HPP__

#include <limits>
#include "internal_shared.hpp"
#include "common.hpp"

namespace cv { namespace gpu { namespace device
{

@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_SATURATE_CAST_HPP__
#define __OPENCV_GPU_SATURATE_CAST_HPP__

#include "internal_shared.hpp"
#include "common.hpp"

namespace cv { namespace gpu { namespace device
{

@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_TRANSFORM_HPP__
#define __OPENCV_GPU_TRANSFORM_HPP__

#include "internal_shared.hpp"
#include "common.hpp"
#include "utility.hpp"
#include "detail/transform_detail.hpp"

@@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_TYPE_TRAITS_HPP__
#define __OPENCV_GPU_TYPE_TRAITS_HPP__

#include "internal_shared.hpp"
#include "detail/type_traits_detail.hpp"

namespace cv { namespace gpu { namespace device

@@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_UTILITY_HPP__
#define __OPENCV_GPU_UTILITY_HPP__

#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "datamov_utils.hpp"
#include "detail/utility_detail.hpp"

@@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__

#include "internal_shared.hpp"
#include "utility.hpp"
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"

@@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_VECMATH_HPP__
#define __OPENCV_GPU_VECMATH_HPP__

#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
#include "functional.hpp"

@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_VEC_TRAITS_HPP__
#define __OPENCV_GPU_VEC_TRAITS_HPP__

#include "internal_shared.hpp"
#include "common.hpp"

namespace cv { namespace gpu { namespace device
{

@@ -43,8 +43,6 @@
#ifndef __OPENCV_GPU_DEVICE_WARP_HPP__
#define __OPENCV_GPU_DEVICE_WARP_HPP__

#include "internal_shared.hpp"

namespace cv { namespace gpu { namespace device
{
    struct Warp

@@ -44,8 +44,6 @@
#ifndef OPENCV_GPU_WARP_REDUCE_HPP__
#define OPENCV_GPU_WARP_REDUCE_HPP__

#include "internal_shared.hpp"

namespace cv { namespace gpu { namespace device
{
    template <class T>
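Among the headers touched above, dynamic_smem.hpp keeps the DynamicSharedMem wrapper, which exposes a kernel's dynamically sized shared-memory buffer through a typed pointer. A short sketch of how such a wrapper is used (the kernel and sizes are illustrative, not taken from this commit):

#include <cuda_runtime.h>

template<class T> struct DynamicSharedMem
{
    __device__ __forceinline__ operator T*()
    {
        extern __shared__ int __smem[];
        return (T*)__smem;
    }
};

// Illustrative kernel: reverse one block's worth of data through shared memory.
__global__ void reverseBlock(float* data, int n)
{
    DynamicSharedMem<float> shared;
    float* buf = shared;                  // points at the extern __shared__ area
    int i = threadIdx.x;
    if (i < n) buf[i] = data[i];
    __syncthreads();
    if (i < n) data[i] = buf[n - 1 - i];
}

void launchReverseBlock(float* d_data, int n)
{
    // the third launch parameter supplies the dynamic shared-memory size in bytes
    reverseBlock<<<1, n, n * sizeof(float)>>>(d_data, n);
}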