moved GpuMat's operations implementation to core module

2011-11-14 14:34:36 +00:00
parent 0f53f2993e
commit 2695039a79
34 changed files with 825 additions and 606 deletions
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -3,4 +3,132 @@ if(ZLIB_FOUND)
 else()
    include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/zlib")
 endif()
-define_opencv_module(core ${ZLIB_LIBRARY})
+
+#define_opencv_module(core ${ZLIB_LIBRARY})
+
+set(name "core")
+
+project(opencv_${name})
+
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
+                    "${CMAKE_CURRENT_SOURCE_DIR}/src"
+                    "${CMAKE_CURRENT_BINARY_DIR}")
+
+file(GLOB lib_srcs "src/*.cpp")
+file(GLOB lib_int_hdrs "src/*.h*")
+file(GLOB lib_hdrs "include/opencv2/${name}/*.h*")
+file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.h*")
+
+if(COMMAND get_module_external_sources)
+   get_module_external_sources(${name})
+endif()
+
+source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
+source_group("Include" FILES ${lib_hdrs})
+source_group("Include\\detail" FILES ${lib_hdrs_detail})
+list(APPEND lib_hdrs ${lib_hdrs_detail})
+
+if (HAVE_CUDA)
+    # Collect the CUDA kernels that are compiled into this module.
+    file(GLOB lib_cuda "src/cuda/*.cu")
+    # The list is passed unquoted on purpose: a quoted "${lib_cuda}" reaches
+    # source_group() as one semicolon-joined argument once the glob matches
+    # more than one file.
+    source_group("Cuda" FILES ${lib_cuda})
+
+    include_directories(${CUDA_INCLUDE_DIRS})
+    # The kernels include device headers that live in the gpu module sources.
+    include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/src")
+    include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/src/cuda")
+
+    # Generate device code for each listed architecture.
+    set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode arch=compute_10,code=sm_10
+                                            -gencode arch=compute_11,code=sm_11
+                                            -gencode arch=compute_12,code=sm_12
+                                            -gencode arch=compute_13,code=sm_13
+                                            -gencode arch=compute_20,code=sm_20
+                                            -gencode arch=compute_20,code=sm_21)
+
+    # Forward -fPIC to the host compiler that nvcc invokes.
+    if (UNIX OR APPLE)
+        set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}  "-Xcompiler;-fPIC;")
+        #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" "-fPIC")
+    endif()
+
+    #set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep")
+    #set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")
+
+    if (APPLE)
+        set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fno-finite-math-only;")
+    endif()
+
+    # cuda_objs receives the generated object files; they are added to the
+    # library target further down.
+    CUDA_COMPILE(cuda_objs ${lib_cuda})
+    #CUDA_BUILD_CLEAN_TARGET()
+endif()
+
+set(the_target "opencv_${name}")
+add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${cuda_objs})
+
+# For dynamic link numbering conventions
+if(NOT ANDROID)
+    # Android SDK build scripts can include only .so files into final .apk
+    # As result we should not set version properties for Android
+    set_target_properties(${the_target} PROPERTIES
+        VERSION ${OPENCV_VERSION}
+        SOVERSION ${OPENCV_SOVERSION}
+        )
+endif()
+
+set_target_properties(${the_target} PROPERTIES OUTPUT_NAME "${the_target}${OPENCV_DLLVERSION}" )    
+
+if(ENABLE_SOLUTION_FOLDERS)
+    set_target_properties(${the_target} PROPERTIES FOLDER "modules")
+endif() 
+        
+if (BUILD_SHARED_LIBS)
+    if(MSVC)
+        set_target_properties(${the_target} PROPERTIES DEFINE_SYMBOL CVAPI_EXPORTS)
+    else()
+        add_definitions(-DCVAPI_EXPORTS)        
+    endif()
+endif()
+
+# Additional target properties
+set_target_properties(${the_target} PROPERTIES
+    DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
+    ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
+    RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
+    INSTALL_NAME_DIR lib
+    )
+
+# Add the required libraries for linking:
+target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ZLIB_LIBRARY})
+
+if (HAVE_CUDA)
+    target_link_libraries(${the_target} ${CUDA_LIBRARIES})
+
+    unset(CUDA_npp_LIBRARY CACHE)
+    find_cuda_helper_libs(npp)
+    target_link_libraries(${the_target} ${CUDA_npp_LIBRARY})
+endif()
+
+if(MSVC)
+    # LINK_FLAGS holds a single string, so a second set_target_properties()
+    # call replaces the first value instead of extending it. Build the full
+    # flag string first and assign it once, so the cross-compiling flag is
+    # not lost.
+    set(the_link_flags "/NODEFAULTLIB:libc /DEBUG")
+    if(CMAKE_CROSSCOMPILING)
+        set(the_link_flags "/NODEFAULTLIB:secchk ${the_link_flags}")
+    endif()
+    set_target_properties(${the_target} PROPERTIES
+        LINK_FLAGS "${the_link_flags}"
+        )
+endif()
+
+# Dependencies of this target:
+add_dependencies(${the_target} ${ZLIB_LIBRARY})
+
+install(TARGETS ${the_target}
+    RUNTIME DESTINATION bin COMPONENT main
+    LIBRARY DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT main
+    ARCHIVE DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT main)
+
+install(FILES ${lib_hdrs}
+    DESTINATION ${OPENCV_INCLUDE_PREFIX}/opencv2/${name}
+    COMPONENT main)
+    
+add_opencv_precompiled_headers(${the_target})
+
+define_opencv_test(${name})
+define_opencv_perf_test(${name})
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -212,27 +212,9 @@ namespace cv { namespace gpu
    CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m);
    CV_EXPORTS void ensureSizeIsEnough(Size size, int type, GpuMat& m);

-    class CV_EXPORTS GpuFuncTable
-    {
-    public:
-        virtual ~GpuFuncTable() {}
+    //////////////////////////////// Error handling ////////////////////////

-        virtual void copy(const Mat& src, GpuMat& dst) const = 0;
-        virtual void copy(const GpuMat& src, Mat& dst) const = 0;
-        virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
-
-        virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
-
-        virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
-        virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0;
-
-        virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0;
-
-        virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
-        virtual void free(void* devPtr) const = 0;
-    };
-
-    CV_EXPORTS void setGpuFuncTable(const GpuFuncTable* funcTbl);
+    CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);

    ////////////////////////////////////////////////////////////////////////

--- a/modules/core/src/cuda/matrix_operations.cu
+++ b/modules/core/src/cuda/matrix_operations.cu
@@ -0,0 +1,345 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/transform.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+
+namespace cv { namespace gpu { namespace device 
+{
+    // Compile-time table giving log2(sizeof(T)) for every supported element
+    // type. The kernels below use it to turn a row pitch into an element
+    // count with a right shift. The primary template is left undefined, so an
+    // unsupported type fails to compile.
+    template <typename T> struct shift_and_sizeof;
+
+    // 1-byte elements
+    template <> struct shift_and_sizeof<unsigned char>  { enum { shift = 0 }; };
+    template <> struct shift_and_sizeof<signed char>    { enum { shift = 0 }; };
+
+    // 2-byte elements
+    template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
+    template <> struct shift_and_sizeof<short>          { enum { shift = 1 }; };
+
+    // 4-byte elements
+    template <> struct shift_and_sizeof<int>            { enum { shift = 2 }; };
+    template <> struct shift_and_sizeof<float>          { enum { shift = 2 }; };
+
+    // 8-byte elements
+    template <> struct shift_and_sizeof<double>         { enum { shift = 3 }; };
+
+    ///////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////// CopyTo /////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////
+
+    // Kernel: one thread per (row, interleaved channel column). Copies the
+    // element from mat_src to mat_dst when the mask byte of the owning pixel
+    // is non-zero; all other destination elements are left untouched.
+    // step_mat is shifted right by log2(sizeof(T)) before indexing, while
+    // step_mask is used as-is on the uchar mask.
+    template<typename T>
+    __global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
+    {
+        const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
+        const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
+
+        // Threads that fall outside the matrix have nothing to do.
+        if (col >= cols * channels || row >= rows)
+            return;
+
+        // col counts channel elements, so col / channels is the pixel index.
+        if (mask[row * step_mask + col / channels] == 0)
+            return;
+
+        const size_t elem = row * (step_mat >> shift_and_sizeof<T>::shift) + col;
+        mat_dst[elem] = mat_src[elem];
+    }
+
+    // Host launcher for the copy_to_with_mask<T> kernel: covers
+    // cols * channels by rows elements with 16x16 thread blocks.
+    template<typename T>
+    void copy_to_with_mask_run(DevMem2Db mat_src, DevMem2Db mat_dst, DevMem2Db mask, int channels, cudaStream_t stream)
+    {
+        dim3 block(16, 16, 1);
+        dim3 grid(divUp(mat_src.cols * channels, block.x), divUp(mat_src.rows, block.y), 1);
+
+        copy_to_with_mask<T><<<grid, block, 0, stream>>>(
+                (T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data,
+                mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
+        cudaSafeCall( cudaGetLastError() );
+
+        // With the default stream the call waits for the kernel to finish;
+        // with a caller-supplied stream it returns right after the launch.
+        if (stream == 0)
+            cudaSafeCall ( cudaDeviceSynchronize() );
+    }
+
+    // Masked copy entry point: selects the copy_to_with_mask_run<T>
+    // instantiation that matches `depth` and forwards all other arguments.
+    //   mat_src, mat_dst - source and destination matrices
+    //   depth            - element depth code, used as index into the table
+    //   mask             - mask matrix; non-zero bytes select pixels to copy
+    //   channels         - number of interleaved channels per pixel
+    //   stream           - CUDA stream for the launch (0 = default stream)
+    // Reports "Unsupported copyTo operation" through cv::gpu::error when the
+    // depth has no handler or lies outside the table.
+    void copy_to_with_mask(DevMem2Db mat_src, DevMem2Db mat_dst, int depth, DevMem2Db mask, int channels, cudaStream_t stream)
+    {
+        typedef void (*CopyToFunc)(DevMem2Db mat_src, DevMem2Db mat_dst, DevMem2Db mask, int channels, cudaStream_t stream);
+
+        static const CopyToFunc tab[8] =
+        {
+            copy_to_with_mask_run<unsigned char>,
+            copy_to_with_mask_run<signed char>,
+            copy_to_with_mask_run<unsigned short>,
+            copy_to_with_mask_run<short>,
+            copy_to_with_mask_run<int>,
+            copy_to_with_mask_run<float>,
+            copy_to_with_mask_run<double>,
+            0
+        };
+        const int tab_size = static_cast<int>(sizeof(tab) / sizeof(tab[0]));
+
+        // Never index the table with a depth it does not cover.
+        const CopyToFunc func = (depth >= 0 && depth < tab_size) ? tab[depth] : 0;
+
+        if (func == 0)
+        {
+            cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);
+            return; // do not call through a null handler if error() returns
+        }
+
+        func(mat_src, mat_dst, mask, channels, stream);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////// SetTo //////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////
+
+    // Constant-memory slots holding the fill value used by the set_to
+    // kernels: one 4-element array per supported element type.
+    __constant__ uchar  scalar_8u[4];
+    __constant__ schar  scalar_8s[4];
+    __constant__ ushort scalar_16u[4];
+    __constant__ short  scalar_16s[4];
+    __constant__ int    scalar_32s[4];
+    __constant__ float  scalar_32f[4];
+    __constant__ double scalar_64f[4];
+
+    // Device-side accessor: returns element i of the slot that matches T.
+    // Only the specializations below are defined.
+    template <typename T> __device__ __forceinline__ T readScalar(int i);
+
+    template <> __device__ __forceinline__ uchar readScalar<uchar>(int i)
+    {
+        return scalar_8u[i];
+    }
+    template <> __device__ __forceinline__ schar readScalar<schar>(int i)
+    {
+        return scalar_8s[i];
+    }
+    template <> __device__ __forceinline__ ushort readScalar<ushort>(int i)
+    {
+        return scalar_16u[i];
+    }
+    template <> __device__ __forceinline__ short readScalar<short>(int i)
+    {
+        return scalar_16s[i];
+    }
+    template <> __device__ __forceinline__ int readScalar<int>(int i)
+    {
+        return scalar_32s[i];
+    }
+    template <> __device__ __forceinline__ float readScalar<float>(int i)
+    {
+        return scalar_32f[i];
+    }
+    template <> __device__ __forceinline__ double readScalar<double>(int i)
+    {
+        return scalar_64f[i];
+    }
+
+    // Host-side overloads that upload four values from `vals` into the
+    // constant-memory slot of the matching element type. Each slot is
+    // declared with exactly four elements, so sizeof(slot) is the byte count
+    // of those four values.
+    void writeScalar(const uchar* vals)
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(scalar_8u)) );
+    }
+
+    void writeScalar(const schar* vals)
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(scalar_8s)) );
+    }
+
+    void writeScalar(const ushort* vals)
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(scalar_16u)) );
+    }
+
+    void writeScalar(const short* vals)
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(scalar_16s)) );
+    }
+
+    void writeScalar(const int* vals)
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(scalar_32s)) );
+    }
+
+    void writeScalar(const float* vals)
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(scalar_32f)) );
+    }
+
+    void writeScalar(const double* vals)
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(scalar_64f)) );
+    }
+
+    // Kernel: one thread per (row, interleaved channel column). Writes the
+    // staged scalar component for that channel into every element of the
+    // matrix. step is shifted right by log2(sizeof(T)) before indexing.
+    template<typename T>
+    __global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
+    {
+        const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
+        const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
+
+        // Threads that fall outside the matrix have nothing to do.
+        if (col >= cols * channels || row >= rows)
+            return;
+
+        const size_t elem = row * (step >> shift_and_sizeof<T>::shift) + col;
+        // col % channels is the channel this element belongs to.
+        mat[elem] = readScalar<T>(col % channels);
+    }
+
+    // Kernel: same as set_to_without_mask, but an element is written only
+    // when the mask byte of its pixel is non-zero. step_mask is used as-is
+    // on the uchar mask; step is shifted right by log2(sizeof(T)).
+    template<typename T>
+    __global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
+    {
+        const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
+        const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
+
+        // Threads that fall outside the matrix have nothing to do.
+        if (col >= cols * channels || row >= rows)
+            return;
+
+        // col counts channel elements, so col / channels is the pixel index.
+        if (mask[row * step_mask + col / channels] == 0)
+            return;
+
+        const size_t elem = row * (step >> shift_and_sizeof<T>::shift) + col;
+        mat[elem] = readScalar<T>(col % channels);
+    }
+    // Masked fill: uploads `scalar` to the constant-memory slot for T and
+    // launches set_to_with_mask<T> over the whole matrix with 32x8 blocks.
+    template <typename T>
+    void set_to_gpu(DevMem2Db mat, const T* scalar, DevMem2Db mask, int channels, cudaStream_t stream)
+    {
+        // The kernel reads the fill value through readScalar<T>, so it has
+        // to be staged before the launch.
+        writeScalar(scalar);
+
+        dim3 block(32, 8, 1);
+        dim3 grid(mat.cols * channels / block.x + 1, mat.rows / block.y + 1, 1);
+
+        set_to_with_mask<T><<<grid, block, 0, stream>>>(
+            (T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
+        cudaSafeCall( cudaGetLastError() );
+
+        // Only the default stream is waited on here.
+        if (stream == 0)
+            cudaSafeCall ( cudaDeviceSynchronize() );
+    }
+
+    // Explicit instantiations, one per supported element type.
+    template void set_to_gpu<uchar >(DevMem2Db mat, const uchar*  scalar, DevMem2Db mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<schar >(DevMem2Db mat, const schar*  scalar, DevMem2Db mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<ushort>(DevMem2Db mat, const ushort* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<short >(DevMem2Db mat, const short*  scalar, DevMem2Db mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<int   >(DevMem2Db mat, const int*    scalar, DevMem2Db mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<float >(DevMem2Db mat, const float*  scalar, DevMem2Db mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<double>(DevMem2Db mat, const double* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
+
+    // Unmasked fill: uploads `scalar` to the constant-memory slot for T and
+    // launches set_to_without_mask<T> over the whole matrix with 32x8 blocks.
+    template <typename T>
+    void set_to_gpu(DevMem2Db mat, const T* scalar, int channels, cudaStream_t stream)
+    {
+        // The kernel reads the fill value through readScalar<T>, so it has
+        // to be staged before the launch.
+        writeScalar(scalar);
+
+        dim3 block(32, 8, 1);
+        dim3 grid(mat.cols * channels / block.x + 1, mat.rows / block.y + 1, 1);
+
+        set_to_without_mask<T><<<grid, block, 0, stream>>>(
+            (T*)mat.data, mat.cols, mat.rows, mat.step, channels);
+        cudaSafeCall( cudaGetLastError() );
+
+        // Only the default stream is waited on here.
+        if (stream == 0)
+            cudaSafeCall ( cudaDeviceSynchronize() );
+    }
+
+    // Explicit instantiations, one per supported element type.
+    template void set_to_gpu<uchar >(DevMem2Db mat, const uchar*  scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<schar >(DevMem2Db mat, const schar*  scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<ushort>(DevMem2Db mat, const ushort* scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<short >(DevMem2Db mat, const short*  scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<int   >(DevMem2Db mat, const int*    scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<float >(DevMem2Db mat, const float*  scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<double>(DevMem2Db mat, const double* scalar, int channels, cudaStream_t stream);
+
+    ///////////////////////////////////////////////////////////////////////////
+    //////////////////////////////// ConvertTo ////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////
+
+    // Unary functor computing saturate_cast<D>(alpha * src + beta) for one
+    // element; alpha and beta are fixed at construction.
+    template <typename T, typename D> struct Convertor : unary_function<T, D>
+    {
+        const double alpha;
+        const double beta;
+
+        Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
+
+        __device__ __forceinline__ D operator()(const T& src) const
+        {
+            return saturate_cast<D>(alpha * src + beta);
+        }
+    };
+
+    namespace detail
+    {
+        // Chooses the transform tuning constants from the byte sizes of the
+        // functor's argument and result types. Size pairs without a
+        // specialization keep the defaults of DefaultTransformFunctorTraits.
+        template <size_t src_size, size_t dst_size, typename F>
+        struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
+        {
+        };
+
+        // ---- 1-byte source ----
+        template <typename F>
+        struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 8 };
+        };
+        template <typename F>
+        struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F>
+        struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 8, smart_shift = 4 };
+        };
+
+        // ---- 2-byte source ----
+        template <typename F>
+        struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F>
+        struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 2 };
+        };
+
+        // ---- 4-byte source ----
+        template <typename F>
+        struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 8, smart_shift = 4 };
+        };
+        template <typename F>
+        struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 8, smart_shift = 2 };
+        };
+
+        // Feeds sizeof(argument_type) and sizeof(result_type) of the functor
+        // into the dispatcher above.
+        template <typename F>
+        struct ConvertTraits
+            : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
+        {
+        };
+    }
+
+    // Hook the size-based traits up for every Convertor<T, D> instantiation.
+    template <typename T, typename D>
+    struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
+    {
+    };
+        
+    // Converts src (elements of type T) into dst (elements of type D) by
+    // applying Convertor<T, D>(alpha, beta) through the generic transform.
+    template<typename T, typename D>
+    void cvt_(DevMem2Db src, DevMem2Db dst, double alpha, double beta, cudaStream_t stream)
+    {
+        // Prepare both coefficients for device use before they are captured
+        // by the functor.
+        cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
+        cudaSafeCall( cudaSetDoubleForDevice(&beta) );
+
+        Convertor<T, D> functor(alpha, beta);
+        ::cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, functor, stream);
+    }
+
+    // Depth-converting entry point: selects the cvt_<T, D> instantiation for
+    // the (sdepth, ddepth) pair and forwards the remaining arguments.
+    //   src, sdepth - source matrix and its depth code (row index)
+    //   dst, ddepth - destination matrix and its depth code (column index)
+    //   alpha, beta - coefficients of dst = saturate(alpha * src + beta)
+    //   stream      - CUDA stream handed on to the transform
+    // Reports "Unsupported convert operation" through cv::gpu::error when
+    // the pair has no handler or either depth lies outside the table.
+    void convert_gpu(DevMem2Db src, int sdepth, DevMem2Db dst, int ddepth, double alpha, double beta, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(DevMem2Db src, DevMem2Db dst, double alpha, double beta, cudaStream_t stream);
+
+        static const caller_t tab[8][8] =
+        {
+            {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
+            cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
+
+            {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
+            cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
+
+            {cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
+            cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
+
+            {cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
+            cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
+
+            {cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
+            cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
+
+            {cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
+            cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
+
+            {cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
+            cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
+
+            {0,0,0,0,0,0,0,0}
+        };
+        // The table is square, so one bound serves both indices.
+        const int depth_count = static_cast<int>(sizeof(tab) / sizeof(tab[0]));
+
+        // Never index the table with a depth it does not cover.
+        const bool in_range = sdepth >= 0 && sdepth < depth_count
+                           && ddepth >= 0 && ddepth < depth_count;
+
+        const caller_t func = in_range ? tab[sdepth][ddepth] : 0;
+        if (!func)
+        {
+            cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
+            return; // do not call through a null handler if error() returns
+        }
+
+        func(src, dst, alpha, beta, stream);
+    }
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -43,6 +43,14 @@
 #include "precomp.hpp"
 #include "opencv2/core/gpumat.hpp"

+#include <iostream>
+#include <sstream>
+
+#ifdef HAVE_CUDA
+    #include <cuda_runtime.h>
+    #include <npp.h>
+#endif
+
 using namespace std;
 using namespace cv;
 using namespace cv::gpu;
@@ -283,6 +291,31 @@ cv::Mat::Mat(const GpuMat& m) : flags(0), dims(0), rows(0), cols(0), data(0), re
    m.download(*this);
 }

+namespace
+{
+    // Abstract table of the device operations used by GpuMat. It is local to
+    // this translation unit, so it carries no export attribute. This file
+    // provides EmptyFuncTable for builds without CUDA and CudaFuncTable for
+    // builds with CUDA.
+    class GpuFuncTable
+    {
+    public:
+        virtual ~GpuFuncTable() {}
+
+        // Copies between host (Mat) and device (GpuMat) matrices.
+        virtual void copy(const Mat& src, GpuMat& dst) const = 0;
+        virtual void copy(const GpuMat& src, Mat& dst) const = 0;
+        virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
+
+        // Copy restricted by a mask matrix.
+        virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
+
+        // Conversion without and with alpha/beta coefficients.
+        virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
+        virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0;
+
+        // Fills m with s, taking a mask matrix.
+        virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0;
+
+        // Device memory allocation and release.
+        virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
+        virtual void free(void* devPtr) const = 0;
+    };
+}
+
+#ifndef HAVE_CUDA
+
 namespace
 {
    void throw_nogpu() 
@@ -308,20 +341,460 @@ namespace
        void free(void*) const {}
    };

-    const GpuFuncTable* g_funcTbl = 0;
-
    const GpuFuncTable* gpuFuncTable()
    {
        static EmptyFuncTable empty;
-        return g_funcTbl ? g_funcTbl : &empty;
+        return &empty;
    }
 }

-void cv::gpu::setGpuFuncTable(const GpuFuncTable* funcTbl)
+#else // HAVE_CUDA
+
+namespace cv { namespace gpu { namespace device // forward declarations of the host-side launchers called further down in this file
 {
-    g_funcTbl = funcTbl;
+    void copy_to_with_mask(DevMem2Db src, DevMem2Db dst, int depth, DevMem2Db mask, int channels, cudaStream_t stream); // masked copy, selected by depth
+
+    template <typename T>
+    void set_to_gpu(DevMem2Db mat, const T* scalar, int channels, cudaStream_t stream); // fill without a mask
+
+    template <typename T>
+    void set_to_gpu(DevMem2Db mat, const T* scalar, DevMem2Db mask, int channels, cudaStream_t stream); // fill with a mask
+
+    void convert_gpu(DevMem2Db src, int sdepth, DevMem2Db dst, int ddepth, double alpha, double beta, cudaStream_t stream); // depth conversion with alpha/beta
+}}}
+
+namespace
+{
+    // cudaSafeCall / nppSafeCall wrap an API call and report a failing
+    // status together with the call site. The function name is added only
+    // where __func__ is known to be available.
+#if defined(__GNUC__)
+    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
+    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
+#else /* defined(__CUDACC__) || defined(__MSVC__) */
+    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
+    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__)
+#endif
+
+    // Forwards the CUDA error string to cv::gpu::error unless err is
+    // cudaSuccess.
+    inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
+    {
+        if (err == cudaSuccess)
+            return;
+
+        cv::gpu::error(cudaGetErrorString(err), file, line, func);
+    }
+
+    // Negative NPP status codes are reported as errors; zero and positive
+    // codes are let through.
+    inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
+    {
+        if (err >= 0)
+            return;
+
+        std::ostringstream text;
+        text << "NPP API Call Error: " << err;
+        cv::gpu::error(text.str().c_str(), file, line, func);
+    }
 }

+namespace
+{
+    // Adapters between the Scalar-based API and the typed set_to_gpu
+    // launchers: the scalar is converted to element type T first, then its
+    // value array is handed to the device code.
+
+    // Variant without a mask.
+    template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
+    {
+        Scalar_<T> typed = s;
+        ::cv::gpu::device::set_to_gpu(src, typed.val, src.channels(), stream);
+    }
+
+    // Variant with a mask.
+    template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
+    {
+        Scalar_<T> typed = s;
+        ::cv::gpu::device::set_to_gpu(src, typed.val, mask, src.channels(), stream);
+    }
+}
+
+namespace cv { namespace gpu
+{
+    // Copies the elements of src selected by mask into dst on the device.
+    CV_EXPORTS void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) 
+    { 
+        ::cv::gpu::device::copy_to_with_mask(src, dst, src.depth(), mask, src.channels(), stream);
+    }
+
+    // Converts src to the depth of dst (alpha = 1, beta = 0) on the default
+    // stream. Both matrices are viewed as single-channel for the conversion.
+    CV_EXPORTS void convertTo(const GpuMat& src, GpuMat& dst)
+    {
+        ::cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0);
+    }  
+
+    // Converts src to the depth of dst using the given alpha and beta.
+    CV_EXPORTS void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0)
+    {
+        ::cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
+    }
+
+    // Fills src with s. The handler is chosen by src.depth(); a depth the
+    // table does not cover is reported through cv::gpu::error instead of
+    // reading past the end of the table.
+    CV_EXPORTS void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream);
+
+        static const caller_t callers[] = 
+        {
+            kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
+            kernelSetCaller<float>, kernelSetCaller<double>
+        };
+        const int caller_count = static_cast<int>(sizeof(callers) / sizeof(callers[0]));
+
+        const int depth = src.depth();
+        if (depth < 0 || depth >= caller_count)
+        {
+            cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__, "");
+            return;
+        }
+
+        callers[depth](src, s, stream);
+    }
+
+    // Fills the elements of src selected by mask with s. Same depth check
+    // as the unmasked overload.
+    CV_EXPORTS void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
+
+        static const caller_t callers[] = 
+        {
+            kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
+            kernelSetCaller<float>, kernelSetCaller<double>
+        };
+        const int caller_count = static_cast<int>(sizeof(callers) / sizeof(callers[0]));
+
+        const int depth = src.depth();
+        if (depth < 0 || depth >= caller_count)
+        {
+            cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__, "");
+            return;
+        }
+
+        callers[depth](src, s, mask, stream);
+    }
+
+    // Convenience overload: unmasked fill on the default stream.
+    CV_EXPORTS void setTo(GpuMat& src, Scalar s)
+    {
+        setTo(src, s, 0);
+    }
+
+    // Convenience overload: masked fill on the default stream.
+    CV_EXPORTS void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
+    {
+        setTo(src, s, mask, 0);
+    }
+}}
+
+namespace
+{
+    //////////////////////////////////////////////////////////////////////////
+    // Convert
+
+    template<int n> struct NPPTypeTraits; // maps an OpenCV depth constant to an NPP element type; declared only, so a depth without a specialization below has no npp_type
+    template<> struct NPPTypeTraits<CV_8U>  { typedef Npp8u npp_type; }; // CV_8U  -> Npp8u
+    template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; }; // CV_16U -> Npp16u
+    template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; }; // CV_16S -> Npp16s
+    template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; }; // CV_32S -> Npp32s
+    template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; }; // CV_32F -> Npp32f
+
+    // Signature of an NPP conversion routine for a given source/destination
+    // depth pair.
+    template<int SDEPTH, int DDEPTH> struct NppConvertFunc
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep,
+                                      dst_t* pDst, int nDstStep,
+                                      NppiSize oSizeROI);
+    };
+    // Conversions from 32-bit float take an additional rounding mode.
+    template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
+    {
+        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep,
+                                      dst_t* pDst, int nDstStep,
+                                      NppiSize oSizeROI, NppRoundMode eRoundMode);
+    };
+
+    // Binds one NPP conversion routine (template argument `func`) to a
+    // uniform cvt(src, dst) entry point operating on whole matrices.
+    template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+        static void cvt(const GpuMat& src, GpuMat& dst)
+        {
+            // The region of interest is the full source matrix.
+            NppiSize roi;
+            roi.width = src.cols;
+            roi.height = src.rows;
+
+            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step),
+                              dst.ptr<dst_t>(), static_cast<int>(dst.step), roi) );
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+    // Float-source variant: passes NPP_RND_NEAR as the rounding mode.
+    template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
+    {
+        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
+
+        static void cvt(const GpuMat& src, GpuMat& dst)
+        {
+            // The region of interest is the full source matrix.
+            NppiSize roi;
+            roi.width = src.cols;
+            roi.height = src.rows;
+
+            nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step),
+                              dst.ptr<dst_t>(), static_cast<int>(dst.step), roi, NPP_RND_NEAR) );
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+
+    //////////////////////////////////////////////////////////////////////////
+    // Set
+    
+    // Signature of an NPP "set" routine for a given depth and channel count.
+    // Multi-channel routines take the fill value as an array.
+    template<int SDEPTH, int SCN> struct NppSetFunc
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        typedef NppStatus (*func_ptr)(const src_t values[],
+                                      src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
+    };
+    // Single-channel routines take the fill value by value.
+    template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        typedef NppStatus (*func_ptr)(src_t val,
+                                      src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
+    };
+
+    // Binds one NPP set routine (template argument `func`) to a uniform
+    // set(src, s) entry point that fills the whole matrix.
+    template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        static void set(GpuMat& src, Scalar s)
+        {
+            // Convert the scalar to the element type before handing it over.
+            Scalar_<src_t> fill = s;
+
+            NppiSize roi;
+            roi.width = src.cols;
+            roi.height = src.rows;
+
+            nppSafeCall( func(fill.val, src.ptr<src_t>(), static_cast<int>(src.step), roi) );
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+    // Single-channel variant: only the first scalar component is used.
+    template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        static void set(GpuMat& src, Scalar s)
+        {
+            // Convert the scalar to the element type before handing it over.
+            Scalar_<src_t> fill = s;
+
+            NppiSize roi;
+            roi.width = src.cols;
+            roi.height = src.rows;
+
+            nppSafeCall( func(fill[0], src.ptr<src_t>(), static_cast<int>(src.step), roi) );
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+
+    // Describes the function-pointer type of a masked multi-channel "set"
+    // primitive for images of depth SDEPTH: same as NppSetFunc plus a pointer
+    // to an 8-bit mask and the mask step.
+    template<int SDEPTH, int SCN> struct NppSetMaskFunc
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        typedef NppStatus (*func_ptr)(const src_t* pValues, src_t* pImage, int nImageStep, NppiSize oRoiSize, const Npp8u* pMaskData, int nMaskDataStep);
+    };
+    // Single-channel variant: the fill value is passed by value instead of as
+    // an array.
+    template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        typedef NppStatus (*func_ptr)(src_t value, src_t* pImage, int nImageStep, NppiSize oRoiSize, const Npp8u* pMaskData, int nMaskDataStep);
+    };
+
+    // Masked fill of a multi-channel GpuMat through the primitive `func`.
+    // The scalar is converted to the matrix element type and its component
+    // array is passed on together with the full-matrix ROI and the mask data;
+    // cudaDeviceSynchronize() is called afterwards.
+    template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        static void set(GpuMat& src, Scalar s, const GpuMat& mask)
+        {
+            // Fill value expressed in the element type expected by `func`.
+            Scalar_<src_t> fillValue = s;
+
+            // The region of interest spans the whole destination matrix.
+            NppiSize roi;
+            roi.height = src.rows;
+            roi.width = src.cols;
+
+            src_t* const data = src.ptr<src_t>();
+            const int pitch = static_cast<int>(src.step);
+
+            const Npp8u* const maskData = mask.ptr<Npp8u>();
+            const int maskPitch = static_cast<int>(mask.step);
+
+            nppSafeCall( func(fillValue.val, data, pitch, roi, maskData, maskPitch) );
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+    // Single-channel variant of NppSetMask: only the first scalar component is
+    // used and it is passed to `func` by value.
+    template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
+    {
+        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
+
+        static void set(GpuMat& src, Scalar s, const GpuMat& mask)
+        {
+            // Fill value expressed in the element type expected by `func`.
+            Scalar_<src_t> fillValue = s;
+
+            // The region of interest spans the whole destination matrix.
+            NppiSize roi;
+            roi.height = src.rows;
+            roi.width = src.cols;
+
+            src_t* const data = src.ptr<src_t>();
+            const int pitch = static_cast<int>(src.step);
+
+            const Npp8u* const maskData = mask.ptr<Npp8u>();
+            const int maskPitch = static_cast<int>(mask.step);
+
+            nppSafeCall( func(fillValue[0], data, pitch, roi, maskData, maskPitch) );
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+
+    // GpuFuncTable implementation backed by the CUDA runtime, NPP primitives
+    // and the ::cv::gpu device helpers. Every operation dispatches either to a
+    // CUDA runtime call or, through a static lookup table indexed by depth and
+    // channel count, to an NPP wrapper or a ::cv::gpu fallback.
+    class CudaFuncTable : public GpuFuncTable
+    {
+    public:
+        // Host -> device copy of src.rows rows, each src.cols * src.elemSize() bytes wide.
+        void copy(const Mat& src, GpuMat& dst) const
+        {
+            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );
+        }
+        // Device -> host copy of src.rows rows, each src.cols * src.elemSize() bytes wide.
+        void copy(const GpuMat& src, Mat& dst) const
+        {
+            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) );
+        }
+        // Device -> device copy of src.rows rows, each src.cols * src.elemSize() bytes wide.
+        void copy(const GpuMat& src, GpuMat& dst) const
+        {
+            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) );
+        }
+
+        // Masked copy; forwards to the ::cv::gpu::copyWithMask helper.
+        void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
+        {
+            ::cv::gpu::copyWithMask(src, dst, mask);
+        }
+
+        // Depth conversion without scaling. The implementation is selected from
+        // a table indexed by [source depth][destination depth][channels - 1].
+        // Entries for identical source and destination depth are null, so that
+        // combination is rejected here.
+        void convert(const GpuMat& src, GpuMat& dst) const
+        {
+            typedef void (*caller_t)(const GpuMat& src, GpuMat& dst);
+            // Only 1..4 channels have entries, hence the last dimension is 4.
+            static const caller_t callers[7][7][4] =
+            {
+                {
+                    /*  8U ->  8U */ {0, 0, 0, 0},
+                    /*  8U ->  8S */ {::cv::gpu::convertTo, ::cv::gpu::convertTo, ::cv::gpu::convertTo, ::cv::gpu::convertTo},
+                    /*  8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::cvt},
+                    /*  8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::cvt},
+                    /*  8U -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /*  8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /*  8U -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
+                },
+                {
+                    /*  8S ->  8U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /*  8S ->  8S */ {0,0,0,0},
+                    /*  8S -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /*  8S -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /*  8S -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /*  8S -> 32F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /*  8S -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
+                },
+                {
+                    /* 16U ->  8U */ {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::cvt},
+                    /* 16U ->  8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 16U -> 16U */ {0,0,0,0},
+                    /* 16U -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 16U -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
+                },
+                {
+                    /* 16S ->  8U */ {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::cvt},
+                    /* 16S ->  8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 16S -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 16S -> 16S */ {0,0,0,0},
+                    /* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 16S -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
+                },
+                {
+                    /* 32S ->  8U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 32S ->  8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 32S -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 32S -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 32S -> 32S */ {0,0,0,0},
+                    /* 32S -> 32F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 32S -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
+                },
+                {
+                    /* 32F ->  8U */ {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 32F ->  8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 32F -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 32F -> 32F */ {0,0,0,0},
+                    /* 32F -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
+                },
+                {
+                    /* 64F ->  8U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 64F ->  8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 64F -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 64F -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 64F -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 64F -> 32F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
+                    /* 64F -> 64F */ {0,0,0,0}
+                }
+            };
+
+            // Reject anything that would index outside the table instead of
+            // reading past its end.
+            CV_Assert(src.depth() <= CV_64F && dst.depth() <= CV_64F);
+            CV_Assert(src.channels() <= 4);
+
+            const caller_t func = callers[src.depth()][dst.depth()][src.channels() - 1];
+            // Checked in release builds too: a null entry must never be called.
+            CV_Assert(func != 0);
+
+            func(src, dst);
+        }
+
+        // Depth conversion with scaling; forwards to ::cv::gpu::convertTo.
+        void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
+        {
+            ::cv::gpu::convertTo(src, dst, alpha, beta);
+        }
+
+        // Sets the elements of m to s, restricted by mask when mask is not empty.
+        // Without a mask two shortcuts use cudaMemset2D: an all-zero scalar
+        // (any depth), and CV_8U matrices whose used scalar components are all
+        // equal. Everything else goes through a table indexed by
+        // [depth][channels - 1].
+        void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
+        {
+            if (mask.empty())
+            {
+                if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
+                {
+                    cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
+                    return;
+                }
+
+                if (m.depth() == CV_8U)
+                {
+                    int cn = m.channels();
+
+                    // Every byte of the row gets the same value, so a byte-wise memset is enough.
+                    if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
+                    {
+                        int val = saturate_cast<uchar>(s[0]);
+                        cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
+                        return;
+                    }
+                }
+
+                typedef void (*caller_t)(GpuMat& src, Scalar s);
+                static const caller_t callers[7][4] =
+                {
+                    {NppSet<CV_8U, 1, nppiSet_8u_C1R>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
+                    {::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo},
+                    {NppSet<CV_16U, 1, nppiSet_16u_C1R>::set, NppSet<CV_16U, 2, nppiSet_16u_C2R>::set, ::cv::gpu::setTo, NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
+                    {NppSet<CV_16S, 1, nppiSet_16s_C1R>::set, NppSet<CV_16S, 2, nppiSet_16s_C2R>::set, ::cv::gpu::setTo, NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
+                    {NppSet<CV_32S, 1, nppiSet_32s_C1R>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
+                    {NppSet<CV_32F, 1, nppiSet_32f_C1R>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
+                    {::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo}
+                };
+
+                // Guard the table lookup against out-of-range depth / channel count.
+                CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
+
+                callers[m.depth()][m.channels() - 1](m, s);
+            }
+            else
+            {
+                typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask);
+
+                static const caller_t callers[7][4] =
+                {
+                    {NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
+                    {::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo},
+                    {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
+                    {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
+                    {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
+                    {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
+                    {::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo}
+                };
+
+                // Guard the table lookup against out-of-range depth / channel count.
+                CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
+
+                callers[m.depth()][m.channels() - 1](m, s, mask);
+            }
+        }
+
+        // Allocates pitched device memory via cudaMallocPitch; the resulting
+        // pointer and row pitch are written to *devPtr and *step.
+        void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
+        {
+            cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
+        }
+
+        // Releases device memory via cudaFree.
+        // NOTE(review): the cudaFree status is discarded here, unlike the other
+        // CUDA calls which go through cudaSafeCall - presumably so that release
+        // paths never raise an error; confirm.
+        void free(void* devPtr) const
+        {
+            cudaFree(devPtr);
+        }
+    };
+    
+    // Returns the CUDA function table. The table is a function-local static,
+    // so it is constructed on the first call and every call yields the same
+    // address.
+    const GpuFuncTable* gpuFuncTable()
+    {
+        static CudaFuncTable cudaTable;
+        const GpuFuncTable* const table = &cudaTable;
+        return table;
+    }
+}
+
+#endif // HAVE_CUDA
+
 void cv::gpu::GpuMat::upload(const Mat& m)
 {
    CV_DbgAssert(!m.empty());
@@ -458,3 +931,19 @@ void cv::gpu::GpuMat::release()
    step = rows = cols = 0;
    refcount = 0;
 }
+
+// Reports a GPU API failure with the code CV_GpuApiCallError.
+//
+// error_string  description of the failure
+// file, line    source location of the failing call
+// func          name of the failing function; may be null
+//
+// While another exception is still uncaught the message is only written to
+// cerr; otherwise the failure is handed to cv::error as a cv::Exception.
+void cv::gpu::error(const char *error_string, const char *file, const int line, const char *func)
+{
+    const int code = CV_GpuApiCallError;
+
+    if (uncaught_exception())
+    {
+        const char* errorStr = cvErrorStr(code);
+        const char* function = func ? func : "unknown function";
+
+        cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line;
+        cerr.flush();
+    }
+    else
+    {
+        // func is allowed to be null (the branch above already caters for it).
+        // cv::Exception presumably takes its text arguments as std::string, for
+        // which a null pointer is undefined behaviour - TODO confirm. Pass an
+        // empty name instead of the raw pointer.
+        cv::error( cv::Exception(code, error_string, func ? func : "", file, line) );
+    }
+}