From 8abdb3721fca6e9de98b36cd4c33d54bd78f486c Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 24 Jan 2011 10:11:02 +0000
Subject: [PATCH] added gpu threshold.

---
 doc/gpu_image_processing.tex | 8 +-
 modules/gpu/include/opencv2/gpu/gpu.hpp | 7 +-
 modules/gpu/src/cuda/element_operations.cu | 111 ++++-
 modules/gpu/src/cuda/mathfunc.cu | 2 +-
 modules/gpu/src/cuda/matrix_operations.cu | 176 ++-----
 modules/gpu/src/cuda/matrix_reductions.cu | 2 +-
 modules/gpu/src/cuda/surf.cu | 1 -
 modules/gpu/src/cuda/surf_key_point.h | 54 ---
 modules/gpu/src/cuda/transform.hpp | 130 ------
 modules/gpu/src/cudastream.cpp | 4 +-
 modules/gpu/src/element_operations.cpp | 70 +++
 modules/gpu/src/imgproc_gpu.cpp | 20 -
 modules/gpu/src/matrix_operations.cpp | 6 +-
 .../gpu/src/opencv2/gpu/device/transform.hpp | 433 ++++++++++++++++++
 .../gpu/src/opencv2/gpu/device/vecmath.hpp | 80 +++-
 modules/gpu/src/precomp.hpp | 1 -
 tests/gpu/src/imgproc_gpu.cpp | 33 +-
 17 files changed, 768 insertions(+), 370 deletions(-)
 delete mode 100644 modules/gpu/src/cuda/surf_key_point.h
 delete mode 100644 modules/gpu/src/cuda/transform.hpp
 create mode 100644 modules/gpu/src/opencv2/gpu/device/transform.hpp

diff --git a/doc/gpu_image_processing.tex b/doc/gpu_image_processing.tex
index fe6a69ee8..d4d127077 100644
--- a/doc/gpu_image_processing.tex
+++ b/doc/gpu_image_processing.tex
@@ -314,13 +314,17 @@ See also: \cvCppCross{cvtColor}.
 Applies a fixed-level threshold to each array element.
 
 \cvdefCpp{
-double threshold(const GpuMat\& src, GpuMat\& dst, double thresh);
+double threshold(const GpuMat\& src, GpuMat\& dst, double thresh, \par double maxval, int type);\newline
+double threshold(const GpuMat\& src, GpuMat\& dst, double thresh, \par double maxval, int type, const Stream\& stream);
 }
 
 \begin{description}
-\cvarg{src}{Source array. Supports only \texttt{CV\_32FC1} type.}
+\cvarg{src}{Source array (single-channel; \texttt{CV\_64F} depth is not supported).}
 \cvarg{dst}{Destination array; will have the same size and the same type as \texttt{src}.}
 \cvarg{thresh}{Threshold value.}
+\cvarg{maxval}{Maximum value to use with the \texttt{THRESH\_BINARY} and \texttt{THRESH\_BINARY\_INV} thresholding types.}
+\cvarg{type}{Thresholding type. For details, see \cvCppCross{threshold}. The \texttt{THRESH\_OTSU} thresholding type is not supported.}
+\cvarg{stream}{Stream for the asynchronous version.}
 \end{description}
 
 See also: \cvCppCross{threshold}.

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index f566867da..769d2373e 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -587,9 +587,10 @@ namespace cv
         //! async version
         CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, const Stream& stream);
 
-        //! applies fixed threshold to the image.
-        //! Now supports only THRESH_TRUNC threshold type and one channel float source.
-        CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh);
+        //! applies fixed threshold to the image
+        CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type);
+        //! async version
+        CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, const Stream& stream);
 
         //! resizes the image
        //! Supports INTER_NEAREST, INTER_LINEAR

diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
index cc44c647e..4d205255c 100644
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -41,7 +41,8 @@
 //M*/
 
 #include "opencv2/gpu/device/vecmath.hpp"
-#include "transform.hpp"
+#include "opencv2/gpu/device/transform.hpp"
+#include "opencv2/gpu/device/saturate_cast.hpp"
 #include "internal_shared.hpp"
 
 using namespace cv::gpu;
@@ -468,4 +469,112 @@ namespace cv { namespace gpu { namespace mathfunc
     template void max_gpu<int>(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
     template void max_gpu<float>(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
     template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
+
+
+    //////////////////////////////////////////////////////////////////////////
+    // threshold
+
+    class ThreshOp
+    {
+    public:
+        ThreshOp(float thresh_, float maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
+
+    protected:
+        float thresh;
+        float maxVal;
+    };
+
+    class ThreshBinary : public ThreshOp
+    {
+    public:
+        ThreshBinary(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}
+
+        template <typename T>
+        __device__ T operator()(const T& src) const
+        {
+            return (float)src > thresh ? saturate_cast<T>(maxVal) : 0;
+        }
+    };
+
+    class ThreshBinaryInv : public ThreshOp
+    {
+    public:
+        ThreshBinaryInv(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}
+
+        template <typename T>
+        __device__ T operator()(const T& src) const
+        {
+            return (float)src > thresh ? 0 : saturate_cast<T>(maxVal);
+        }
+    };
+
+    class ThreshTrunc : public ThreshOp
+    {
+    public:
+        ThreshTrunc(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}
+
+        template <typename T>
+        __device__ T operator()(const T& src) const
+        {
+            return saturate_cast<T>(fmin((float)src, thresh));
+        }
+    };
+
+    class ThreshToZero : public ThreshOp
+    {
+    public:
+        ThreshToZero(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}
+
+        template <typename T>
+        __device__ T operator()(const T& src) const
+        {
+            return (float)src > thresh ? src : 0;
+        }
+    };
+
+    class ThreshToZeroInv : public ThreshOp
+    {
+    public:
+        ThreshToZeroInv(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}
+
+        template <typename T>
+        __device__ T operator()(const T& src) const
+        {
+            return (float)src > thresh ? 0 : src;
+        }
+    };
+
+    template <class Op, typename T>
+    void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, float thresh, float maxVal,
+        cudaStream_t stream)
+    {
+        Op op(thresh, maxVal);
+        transform(src, dst, op, stream);
+    }
+
+    template <typename T>
+    void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type,
+        cudaStream_t stream)
+    {
+        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, float thresh, float maxVal,
+            cudaStream_t stream);
+
+        static const caller_t callers[] =
+        {
+            threshold_caller<ThreshBinary, T>,
+            threshold_caller<ThreshBinaryInv, T>,
+            threshold_caller<ThreshTrunc, T>,
+            threshold_caller<ThreshToZero, T>,
+            threshold_caller<ThreshToZeroInv, T>
+        };
+
+        callers[type]((DevMem2D_<T>)src, (DevMem2D_<T>)dst, thresh, maxVal, stream);
+    }
+
+    template void threshold_gpu<uchar>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
+    template void threshold_gpu<schar>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
+    template void threshold_gpu<ushort>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
+    template void threshold_gpu<short>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
+    template void threshold_gpu<int>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
+    template void threshold_gpu<float>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
 }}}

diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index cd7ee6fa7..fc686ee95 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -43,7 +43,7 @@
 #include "opencv2/gpu/device/limits_gpu.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/vecmath.hpp"
-#include "transform.hpp"
+#include "opencv2/gpu/device/transform.hpp"
 #include "internal_shared.hpp"
 
 using namespace cv::gpu;

diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu
index 42e55b3f8..2ec794f65 100644
--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -42,6 +42,7 @@
 
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/transform.hpp"
 
 using namespace cv::gpu::device;
 
@@ -55,63 +56,6 @@ namespace cv { namespace gpu { namespace matrix_operations {
     template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
     template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
     template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };
-
-    template <typename T, typename DT, size_t src_elem_size, size_t dst_elem_size>
-    struct ReadWriteTraits
-    {
-        enum {shift=1};
-
-        typedef T read_type;
-        typedef DT write_type;
-    };
-    template <typename T, typename DT>
-    struct ReadWriteTraits<T, DT, 1, 1>
-    {
-        enum {shift=4};
-
-        typedef char4 read_type;
-        typedef char4 write_type;
-    };
-    template <typename T, typename DT>
-    struct ReadWriteTraits<T, DT, 2, 1>
-    {
-        enum {shift=4};
-
-        typedef short4 read_type;
-        typedef char4 write_type;
-    };
-    template <typename T, typename DT>
-    struct ReadWriteTraits<T, DT, 4, 1>
-    {
-        enum {shift=4};
-
-        typedef int4 read_type;
-        typedef char4 write_type;
-    };
-    template <typename T, typename DT>
-    struct ReadWriteTraits<T, DT, 1, 2>
-    {
-        enum {shift=2};
-
-        typedef char2 read_type;
-        typedef short2 write_type;
-    };
-    template <typename T, typename DT>
-    struct ReadWriteTraits<T, DT, 2, 2>
-    {
-        enum {shift=2};
-
-        typedef short2 read_type;
-        typedef short2 write_type;
-    };
-    template <typename T, typename DT>
-    struct ReadWriteTraits<T, DT, 4, 2>
-    {
-        enum {shift=2};
-
-        typedef int2 read_type;
-        typedef short2 write_type;
-    };
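The ReadWriteTraits machinery deleted above (and the hand-rolled convert_to kernel removed further down) is what this patch replaces with a single generic transform primitive plus small per-element functors; the vectorized read/write trick survives, generalized, as the Un/BinReadWriteTraits in the new opencv2/gpu/device/transform.hpp. A minimal standalone sketch of the functor-plus-transform pattern, assuming only the CUDA runtime (ScaleAdd and transformKernel are illustrative names, not code from the patch):

#include <cuda_runtime.h>

// Per-element functor, mirroring the spirit of the Convertor this patch adds:
// dst = alpha * src + beta (the real code additionally saturate_casts the result).
template <typename T, typename D> struct ScaleAdd
{
    double alpha, beta;
    __device__ D operator()(T src) const { return static_cast<D>(alpha * src + beta); }
};

// Generic unary transform: one thread per element, any functor.
template <typename T, typename D, class Op>
__global__ void transformKernel(const T* src, D* dst, int n, Op op)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x < n)
        dst[x] = op(src[x]);
}

int main()
{
    const int n = 1024;
    float* d_src = 0;
    unsigned char* d_dst = 0;
    cudaMalloc(&d_src, n * sizeof(float));
    cudaMalloc(&d_dst, n);
    cudaMemset(d_src, 0, n * sizeof(float));  // dummy input for the sketch

    ScaleAdd<float, unsigned char> op = { 2.0, 16.0 };
    transformKernel<<<(n + 255) / 256, 256>>>(d_src, d_dst, n, op);
    cudaDeviceSynchronize();

    cudaFree(d_src);
    cudaFree(d_dst);
    return 0;
}

One kernel now serves every per-element operation (threshold, convertTo, min/max, ...), so adding a new operation reduces to writing a new functor.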
/////////////////////////////////////////////////////////////////////////// ////////////////////////////////// CopyTo ///////////////////////////////// @@ -276,88 +220,64 @@ namespace cv { namespace gpu { namespace matrix_operations { //////////////////////////////// ConvertTo //////////////////////////////// /////////////////////////////////////////////////////////////////////////// - template - __global__ static void convert_to(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta) + template + class Convertor { - typedef typename ReadWriteTraits::read_type read_type; - typedef typename ReadWriteTraits::write_type write_type; - const int shift = ReadWriteTraits::shift; + public: + Convertor(double alpha_, double beta_): alpha(alpha_), beta(beta_) {} - const size_t x = threadIdx.x + blockIdx.x * blockDim.x; - const size_t y = threadIdx.y + blockIdx.y * blockDim.y; - - if (y < height) + __device__ D operator()(const T& src) { - const T* src = (const T*)(srcmat + src_step * y); - DT* dst = (DT*)(dstmat + dst_step * y); - if ((x * shift) + shift - 1 < width) - { - read_type srcn_el = ((read_type*)src)[x]; - write_type dstn_el; - - const T* src1_el = (const T*) &srcn_el; - DT* dst1_el = (DT*) &dstn_el; - - for (int i = 0; i < shift; ++i) - dst1_el[i] = saturate_cast
(alpha * src1_el[i] + beta); - - ((write_type*)dst)[x] = dstn_el; - } - else - { - for (int i = 0; i < shift - 1; ++i) - if ((x * shift) + i < width) - dst[(x * shift) + i] = saturate_cast
(alpha * src[(x * shift) + i] + beta); - } + return saturate_cast(alpha * src + beta); } - } - typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream); - - template - void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream) + private: + double alpha, beta; + }; + + template + void cvt_(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta, cudaStream_t stream) { - const int shift = ReadWriteTraits::shift; - - dim3 block(32, 8); - dim3 grid(divUp(width, block.x * shift), divUp(height, block.y)); - - convert_to<<>>(src.data, src.step, dst.data, dst.step, width, height, alpha, beta); - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); + Convertor op(alpha, beta); + transform((DevMem2D_)src, (DevMem2D_)dst, op, stream); } - void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream) + void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, + cudaStream_t stream = 0) { - static CvtFunc tab[8][8] = - { - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, + typedef void (*caller_t)(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta, + cudaStream_t stream); - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {0,0,0,0,0,0,0,0} + static const caller_t tab[8][8] = + { + {cvt_, cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, 0}, + + {cvt_, cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, 0}, + + {cvt_, cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, 0}, + + {cvt_, cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, 0}, + + {cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, cvt_, 0}, + + {cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, cvt_, 0}, + + {cvt_, cvt_, cvt_, + cvt_, cvt_, cvt_, cvt_, 0}, + + {0,0,0,0,0,0,0,0} }; - CvtFunc func = tab[sdepth][ddepth]; - if (func == 0) - cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__); - func(src, dst, src.cols * channels, src.rows, alpha, beta, stream); + caller_t func = tab[sdepth][ddepth]; + if (!func) + cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__); + + func(src, dst, alpha, beta, stream); } }}} diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu index c8d516a35..6dd7dabb5 100644 --- a/modules/gpu/src/cuda/matrix_reductions.cu +++ b/modules/gpu/src/cuda/matrix_reductions.cu @@ -43,7 +43,7 @@ #include "opencv2/gpu/device/limits_gpu.hpp" #include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/vecmath.hpp" -#include "transform.hpp" +#include "opencv2/gpu/device/transform.hpp" #include "internal_shared.hpp" using namespace cv::gpu; diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index a6aef8a35..00f62d475 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -46,7 +46,6 @@ //M*/ #include "internal_shared.hpp" -#include "surf_key_point.h" #include "opencv2/gpu/device/limits_gpu.hpp" using namespace cv::gpu; diff --git a/modules/gpu/src/cuda/surf_key_point.h b/modules/gpu/src/cuda/surf_key_point.h deleted file mode 100644 index 31370042e..000000000 --- 
a/modules/gpu/src/cuda/surf_key_point.h +++ /dev/null @@ -1,54 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_SURF_KEY_POINT_H__ -#define __OPENCV_SURF_KEY_POINT_H__ - -namespace cv -{ - namespace gpu - { - - } -} - -#endif // __OPENCV_SURF_KEY_POINT_H__ diff --git a/modules/gpu/src/cuda/transform.hpp b/modules/gpu/src/cuda/transform.hpp deleted file mode 100644 index b8f066e6e..000000000 --- a/modules/gpu/src/cuda/transform.hpp +++ /dev/null @@ -1,130 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. 
-// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_GPU_TRANSFORM_HPP__ -#define __OPENCV_GPU_TRANSFORM_HPP__ - -#include "internal_shared.hpp" - -namespace cv { namespace gpu { namespace device -{ - //! Mask accessor - template struct MaskReader_ - { - PtrStep_ mask; - explicit MaskReader_(PtrStep_ mask): mask(mask) {} - - __device__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; } - }; - - //! Stub mask accessor - struct NoMask - { - __device__ bool operator()(int y, int x) const { return true; } - }; - - //! Transform kernels - - template - static __global__ void transform(const DevMem2D_ src, PtrStep_ dst, const Mask mask, UnOp op) - { - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; - - if (x < src.cols && y < src.rows && mask(y, x)) - { - T src_data = src.ptr(y)[x]; - dst.ptr(y)[x] = op(src_data); - } - } - - template - static __global__ void transform(const DevMem2D_ src1, const PtrStep_ src2, PtrStep_ dst, const Mask mask, BinOp op) - { - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; - - if (x < src1.cols && y < src1.rows && mask(y, x)) - { - T1 src1_data = src1.ptr(y)[x]; - T2 src2_data = src2.ptr(y)[x]; - dst.ptr(y)[x] = op(src1_data, src2_data); - } - } -}}} - -namespace cv -{ - namespace gpu - { - template - static void transform(const DevMem2D_& src, const DevMem2D_& dst, UnOp op, cudaStream_t stream) - { - dim3 threads(16, 16, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src.cols, threads.x); - grid.y = divUp(src.rows, threads.y); - - device::transform<<>>(src, dst, device::NoMask(), op); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); - } - template - static void transform(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, BinOp op, cudaStream_t stream) - { - dim3 threads(16, 16, 1); - dim3 grid(1, 1, 1); - - grid.x = divUp(src1.cols, threads.x); - grid.y = divUp(src1.rows, threads.y); - - device::transform<<>>(src1, src2, dst, device::NoMask(), op); - - if (stream == 0) - cudaSafeCall( cudaThreadSynchronize() ); - } - } -} - -#endif // __OPENCV_GPU_TRANSFORM_HPP__ diff --git a/modules/gpu/src/cudastream.cpp b/modules/gpu/src/cudastream.cpp index 79abe4a65..4965f2527 100644 --- a/modules/gpu/src/cudastream.cpp +++ b/modules/gpu/src/cudastream.cpp @@ -80,7 +80,7 @@ 
namespace cv
         void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
         void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
 
-        void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream = 0);
+        void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
     }
   }
 }
@@ -204,7 +204,7 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype,
     psrc = &(temp = src);
 
     dst.create( src.size(), rtype );
 
-    matrix_operations::convert_to(*psrc, sdepth, dst, ddepth, psrc->channels(), alpha, beta, impl->stream);
+    matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
 }

diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp
index 4d61cc353..2c88722f5 100644
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -74,6 +74,8 @@ void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
 void cv::gpu::max(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
 void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
+double cv::gpu::threshold(const GpuMat&, GpuMat&, double, double, int) {throw_nogpu(); return 0.0;}
+double cv::gpu::threshold(const GpuMat&, GpuMat&, double, double, int, const Stream&) {throw_nogpu(); return 0.0;}
 
 #else
 
@@ -696,4 +698,72 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& st
     funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
 }
 
+////////////////////////////////////////////////////////////////////////
+// threshold
+
+namespace cv { namespace gpu { namespace mathfunc
+{
+    template <typename T>
+    void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type,
+        cudaStream_t stream);
+}}}
+
+namespace
+{
+    void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type,
+        cudaStream_t stream = 0)
+    {
+        using namespace cv::gpu::mathfunc;
+
+        typedef void (*caller_t)(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type,
+            cudaStream_t stream);
+
+        static const caller_t callers[] =
+        {
+            threshold_gpu<uchar>, threshold_gpu<schar>,
+            threshold_gpu<ushort>, threshold_gpu<short>, threshold_gpu<int>, threshold_gpu<float>, 0
+        };
+
+        CV_Assert(src.channels() == 1 && src.depth() < CV_64F);
+        CV_Assert(type <= THRESH_TOZERO_INV);
+
+        dst.create(src.size(), src.type());
+
+        if (src.depth() != CV_32F)
+        {
+            thresh = cvFloor(thresh);
+            maxVal = cvRound(maxVal);
+        }
+
+        callers[src.depth()](src, dst, static_cast<float>(thresh), static_cast<float>(maxVal), type, stream);
+    }
+}
+
+double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type)
+{
+    if (src.type() == CV_32FC1 && type == THRESH_TRUNC)
+    {
+        dst.create(src.size(), src.type());
+
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
+
+        nppSafeCall( nppiThreshold_32f_C1R(src.ptr<Npp32f>(), src.step,
+            dst.ptr<Npp32f>(), dst.step, sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER) );
+    }
+    else
+    {
+        threshold_caller(src, dst, thresh, maxVal, type);
+    }
+
+    return thresh;
+}
+
+double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, const Stream& stream)
+{
+    threshold_caller(src, dst, thresh, maxVal, type, StreamAccessor::getStream(stream));
+    return thresh;
+}
+
 #endif
\ No newline at end of file

diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp
index 3b866c945..693aa56ab 100644
--- a/modules/gpu/src/imgproc_gpu.cpp
+++ b/modules/gpu/src/imgproc_gpu.cpp
@@ -55,7 +55,6 @@ void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int) { throw_nogpu(); }
 void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, const Stream&) { throw_nogpu(); }
 void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&) { throw_nogpu(); }
 void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, const Stream&) { throw_nogpu(); }
-double cv::gpu::threshold(const GpuMat&, GpuMat&, double) { throw_nogpu(); return 0.0; }
 void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int) { throw_nogpu(); }
 void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, const Scalar&) { throw_nogpu(); }
 void cv::gpu::warpAffine(const GpuMat&, GpuMat&, const Mat&, Size, int) { throw_nogpu(); }
@@ -241,25 +240,6 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q,
     reprojectImageTo3D_callers[disp.type()](disp, xyzw, Q, StreamAccessor::getStream(stream));
 }
 
-////////////////////////////////////////////////////////////////////////
-// threshold
-
-double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh)
-{
-    CV_Assert(src.type() == CV_32FC1);
-
-    dst.create( src.size(), src.type() );
-
-    NppiSize sz;
-    sz.width = src.cols;
-    sz.height = src.rows;
-
-    nppSafeCall( nppiThreshold_32f_C1R(src.ptr<Npp32f>(), src.step,
-        dst.ptr<Npp32f>(), dst.step, sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER) );
-
-    return thresh;
-}
-
 ////////////////////////////////////////////////////////////////////////
 // resize

diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp
index e2a88cc03..887f64adb 100644
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -90,7 +90,7 @@ namespace cv
             void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
             void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
 
-            void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream = 0);
+            void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
         }
     }
 }
@@ -193,7 +193,7 @@ namespace
     void convertToKernelCaller(const GpuMat& src, GpuMat& dst)
     {
-        matrix_operations::convert_to(src, src.depth(), dst, dst.depth(), src.channels(), 1.0, 0.0);
+        matrix_operations::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
     }
 }
 
@@ -222,7 +222,7 @@ void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double be
     dst.create( size(), rtype );
 
     if (!noScale)
-        matrix_operations::convert_to(*psrc, sdepth, dst, ddepth, psrc->channels(), alpha, beta);
+        matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta);
     else
     {
         typedef void (*convert_caller_t)(const GpuMat& src, GpuMat& dst);

diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp
new file mode 100644
index 000000000..959cca235
--- /dev/null
+++
b/modules/gpu/src/opencv2/gpu/device/transform.hpp @@ -0,0 +1,433 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_TRANSFORM_HPP__ +#define __OPENCV_GPU_TRANSFORM_HPP__ + +#include "internal_shared.hpp" +#include "vecmath.hpp" + +namespace cv { namespace gpu { namespace device +{ + //! Mask accessor + + class MaskReader + { + public: + explicit MaskReader(const PtrStep& mask_): mask(mask_) {} + + __device__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; } + + private: + PtrStep mask; + }; + + struct NoMask + { + __device__ bool operator()(int y, int x) const { return true; } + }; + + //! Read Write Traits + + template + struct UnReadWriteTraits_ + { + enum {shift=1}; + }; + template + struct UnReadWriteTraits_ + { + enum {shift=4}; + }; + template + struct UnReadWriteTraits_ + { + enum {shift=2}; + }; + template struct UnReadWriteTraits + { + enum {shift=UnReadWriteTraits_::shift}; + + typedef typename TypeVec::vec_t read_type; + typedef typename TypeVec::vec_t write_type; + }; + + template + struct BinReadWriteTraits_ + { + enum {shift=1}; + }; + template + struct BinReadWriteTraits_ + { + enum {shift=4}; + }; + template + struct BinReadWriteTraits_ + { + enum {shift=2}; + }; + template struct BinReadWriteTraits + { + enum {shift=BinReadWriteTraits_::shift}; + + typedef typename TypeVec::vec_t read_type1; + typedef typename TypeVec::vec_t read_type2; + typedef typename TypeVec::vec_t write_type; + }; + + //! 
Transform kernels + + template struct OpUnroller; + template <> struct OpUnroller<1> + { + template + static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) + { + if (mask(y, x_shifted)) + dst.x = op(src.x); + } + + template + static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) + { + if (mask(y, x_shifted)) + dst.x = op(src1.x, src2.x); + } + }; + template <> struct OpUnroller<2> + { + template + static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) + { + if (mask(y, x_shifted)) + dst.x = op(src.x); + if (mask(y, x_shifted + 1)) + dst.y = op(src.y); + } + + template + static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) + { + if (mask(y, x_shifted)) + dst.x = op(src1.x, src2.x); + if (mask(y, x_shifted + 1)) + dst.y = op(src1.y, src2.y); + } + }; + template <> struct OpUnroller<3> + { + template + static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) + { + if (mask(y, x_shifted)) + dst.x = op(src.x); + if (mask(y, x_shifted + 1)) + dst.y = op(src.y); + if (mask(y, x_shifted + 2)) + dst.z = op(src.z); + } + + template + static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) + { + if (mask(y, x_shifted)) + dst.x = op(src1.x, src2.x); + if (mask(y, x_shifted + 1)) + dst.y = op(src1.y, src2.y); + if (mask(y, x_shifted + 2)) + dst.z = op(src1.z, src2.z); + } + }; + template <> struct OpUnroller<4> + { + template + static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) + { + if (mask(y, x_shifted)) + dst.x = op(src.x); + if (mask(y, x_shifted + 1)) + dst.y = op(src.y); + if (mask(y, x_shifted + 2)) + dst.z = op(src.z); + if (mask(y, x_shifted + 3)) + dst.w = op(src.w); + } + + template + static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) + { + if (mask(y, x_shifted)) + dst.x = op(src1.x, src2.x); + if (mask(y, x_shifted + 1)) + dst.y = op(src1.y, src2.y); + if (mask(y, x_shifted + 2)) + dst.z = op(src1.z, src2.z); + if (mask(y, x_shifted + 3)) + dst.w = op(src1.w, src2.w); + } + }; + + template + __global__ static void transformSmart(const DevMem2D_ src_, PtrStep_ dst_, const Mask mask, UnOp op) + { + typedef typename UnReadWriteTraits::read_type read_type; + typedef typename UnReadWriteTraits::write_type write_type; + const int shift = UnReadWriteTraits::shift; + + const int x = threadIdx.x + blockIdx.x * blockDim.x; + const int y = threadIdx.y + blockIdx.y * blockDim.y; + const int x_shifted = x * shift; + + if (y < src_.rows) + { + const T* src = src_.ptr(y); + D* dst = dst_.ptr(y); + + if (x_shifted + shift - 1 < src_.cols) + { + read_type src_n_el = ((const read_type*)src)[x]; + write_type dst_n_el; + + OpUnroller::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y); + + ((write_type*)dst)[x] = dst_n_el; + } + else + { + for (int real_x = x_shifted; real_x < src_.cols; ++real_x) + { + if (mask(y, real_x)) + dst[real_x] = op(src[real_x]); + } + } + } + } + + template + static __global__ void transformSimple(const DevMem2D_ src, PtrStep_ dst, const Mask mask, UnOp op) + { + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (x < src.cols && y < src.rows && mask(y, x)) + { + 
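            // transformSimple (unary): one thread per element, with bounds and
            // mask checks. Contrast with transformSmart above, which reads
            // UnReadWriteTraits<T>::shift elements per thread through a single
            // vector load (e.g. char4) and falls back to a scalar loop for the
            // ragged end of each row.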
dst.ptr(y)[x] = op(src.ptr(y)[x]); + } + } + + template + __global__ static void transformSmart(const DevMem2D_ src1_, const PtrStep_ src2_, PtrStep_ dst_, + const Mask mask, BinOp op) + { + typedef typename BinReadWriteTraits::read_type1 read_type1; + typedef typename BinReadWriteTraits::read_type2 read_type2; + typedef typename BinReadWriteTraits::write_type write_type; + const int shift = BinReadWriteTraits::shift; + + const int x = threadIdx.x + blockIdx.x * blockDim.x; + const int y = threadIdx.y + blockIdx.y * blockDim.y; + const int x_shifted = x * shift; + + if (y < src1_.rows) + { + const T1* src1 = src1_.ptr(y); + const T2* src2 = src2_.ptr(y); + D* dst = dst_.ptr(y); + + if (x_shifted + shift - 1 < src1_.cols) + { + read_type1 src1_n_el = ((const read_type1*)src1)[x]; + read_type2 src2_n_el = ((const read_type2*)src2)[x]; + write_type dst_n_el; + + OpUnroller::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y); + + ((write_type*)dst)[x] = dst_n_el; + } + else + { + for (int real_x = x_shifted; real_x < src1_.cols; ++real_x) + { + if (mask(y, real_x)) + dst[real_x] = op(src1[real_x], src2[real_x]); + } + } + } + } + + template + static __global__ void transformSimple(const DevMem2D_ src1, const PtrStep_ src2, PtrStep_ dst, + const Mask mask, BinOp op) + { + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (x < src1.cols && y < src1.rows && mask(y, x)) + { + T1 src1_data = src1.ptr(y)[x]; + T2 src2_data = src2.ptr(y)[x]; + dst.ptr(y)[x] = op(src1_data, src2_data); + } + } +}}} + +namespace cv +{ + namespace gpu + { + template struct TransformChooser; + template<> struct TransformChooser + { + template + static void call(const DevMem2D_& src, const DevMem2D_& dst, UnOp op, const Mask& mask, + cudaStream_t stream = 0) + { + dim3 threads(16, 16, 1); + dim3 grid(1, 1, 1); + + grid.x = divUp(src.cols, threads.x); + grid.y = divUp(src.rows, threads.y); + + device::transformSimple<<>>(src, dst, mask, op); + + if (stream == 0) + cudaSafeCall( cudaThreadSynchronize() ); + } + + template + static void call(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, + BinOp op, const Mask& mask, cudaStream_t stream = 0) + { + dim3 threads(16, 16, 1); + dim3 grid(1, 1, 1); + + grid.x = divUp(src1.cols, threads.x); + grid.y = divUp(src1.rows, threads.y); + + device::transformSimple<<>>(src1, src2, dst, mask, op); + + if (stream == 0) + cudaSafeCall( cudaThreadSynchronize() ); + } + }; + template<> struct TransformChooser + { + template + static void call(const DevMem2D_& src, const DevMem2D_& dst, UnOp op, const Mask& mask, + cudaStream_t stream = 0) + { + const int shift = device::UnReadWriteTraits::shift; + + dim3 threads(16, 16, 1); + dim3 grid(1, 1, 1); + + grid.x = divUp(src.cols, threads.x * shift); + grid.y = divUp(src.rows, threads.y); + + device::transformSmart<<>>(src, dst, mask, op); + + if (stream == 0) + cudaSafeCall( cudaThreadSynchronize() ); + } + + template + static void call(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, + BinOp op, const Mask& mask, cudaStream_t stream = 0) + { + const int shift = device::BinReadWriteTraits::shift; + + dim3 threads(16, 16, 1); + dim3 grid(1, 1, 1); + + grid.x = divUp(src1.cols, threads.x * shift); + grid.y = divUp(src1.rows, threads.y); + + device::transformSmart<<>>(src1, src2, dst, mask, op); + + if (stream == 0) + cudaSafeCall( cudaThreadSynchronize() ); + } + }; + + template + static void transform_caller(const DevMem2D_& src, const 
DevMem2D_& dst, UnOp op, const Mask& mask, + cudaStream_t stream = 0) + { + TransformChooser::cn == 1 && device::VecTraits::cn == 1 && device::UnReadWriteTraits::shift != 1>::call(src, dst, op, mask, stream); + } + + template + static void transform(const DevMem2D_& src, const DevMem2D_& dst, UnOp op, cudaStream_t stream = 0) + { + transform_caller(src, dst, op, device::NoMask(), stream); + } + template + static void transform(const DevMem2D_& src, const DevMem2D_& dst, const PtrStep& mask, UnOp op, + cudaStream_t stream = 0) + { + transform_caller(src, dst, op, device::MaskReader(mask), stream); + } + + template + static void transform_caller(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, + BinOp op, const Mask& mask, cudaStream_t stream = 0) + { + TransformChooser::cn == 1 && device::VecTraits::cn == 1 && device::VecTraits::cn == 1 && device::BinReadWriteTraits::shift != 1>::call(src1, src2, dst, op, mask, stream); + } + + template + static void transform(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, + BinOp op, cudaStream_t stream = 0) + { + transform_caller(src1, src2, dst, op, device::NoMask(), stream); + } + template + static void transform(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, + const PtrStep& mask, BinOp op, cudaStream_t stream = 0) + { + transform_caller(src1, src2, dst, op, device::MaskReader(mask), stream); + } + } +} + +#endif // __OPENCV_GPU_TRANSFORM_HPP__ diff --git a/modules/gpu/src/opencv2/gpu/device/vecmath.hpp b/modules/gpu/src/opencv2/gpu/device/vecmath.hpp index dc04203e5..d34efe8bf 100644 --- a/modules/gpu/src/opencv2/gpu/device/vecmath.hpp +++ b/modules/gpu/src/opencv2/gpu/device/vecmath.hpp @@ -64,12 +64,16 @@ namespace cv template<> struct TypeVec { typedef uchar4 vec_t; }; template<> struct TypeVec { typedef char vec_t; }; + template<> struct TypeVec { typedef char vec_t; }; template<> struct TypeVec { typedef char1 vec_t; }; template<> struct TypeVec { typedef char2 vec_t; }; + template<> struct TypeVec { typedef char2 vec_t; }; template<> struct TypeVec { typedef char2 vec_t; }; template<> struct TypeVec { typedef char3 vec_t; }; + template<> struct TypeVec { typedef char3 vec_t; }; template<> struct TypeVec { typedef char3 vec_t; }; template<> struct TypeVec { typedef char4 vec_t; }; + template<> struct TypeVec { typedef char4 vec_t; }; template<> struct TypeVec { typedef char4 vec_t; }; template<> struct TypeVec { typedef ushort vec_t; }; @@ -117,6 +121,15 @@ namespace cv template<> struct TypeVec { typedef float4 vec_t; }; template<> struct TypeVec { typedef float4 vec_t; }; + template<> struct TypeVec { typedef double vec_t; }; + template<> struct TypeVec { typedef double1 vec_t; }; + template<> struct TypeVec { typedef double2 vec_t; }; + template<> struct TypeVec { typedef double2 vec_t; }; + template<> struct TypeVec { typedef double3 vec_t; }; + template<> struct TypeVec { typedef double3 vec_t; }; + template<> struct TypeVec { typedef double4 vec_t; }; + template<> struct TypeVec { typedef double4 vec_t; }; + template struct VecTraits; template<> struct VecTraits @@ -162,33 +175,40 @@ namespace cv static __device__ __host__ char all(char v) {return v;} static __device__ __host__ char make(char x) {return x;} }; + template<> struct VecTraits + { + typedef schar elem_t; + enum {cn=1}; + static __device__ __host__ schar all(schar v) {return v;} + static __device__ __host__ schar make(schar x) {return x;} + }; template<> struct VecTraits { - typedef char elem_t; + typedef schar elem_t; 
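        // elem_t is schar (signed char) rather than plain char: CUDA's
        // make_char1..make_char4 helpers take signed char, and plain char's
        // signedness is implementation-defined, which is why this patch adds
        // separate schar specializations of TypeVec/VecTraits alongside the
        // char ones.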
enum {cn=1}; - static __device__ __host__ char1 all(char v) {return make_char1(v);} - static __device__ __host__ char1 make(char x) {return make_char1(x);} + static __device__ __host__ char1 all(schar v) {return make_char1(v);} + static __device__ __host__ char1 make(schar x) {return make_char1(x);} }; template<> struct VecTraits { - typedef char elem_t; + typedef schar elem_t; enum {cn=2}; - static __device__ __host__ char2 all(char v) {return make_char2(v, v);} - static __device__ __host__ char2 make(char x, char y) {return make_char2(x, y);} + static __device__ __host__ char2 all(schar v) {return make_char2(v, v);} + static __device__ __host__ char2 make(schar x, schar y) {return make_char2(x, y);} }; template<> struct VecTraits { - typedef char elem_t; + typedef schar elem_t; enum {cn=3}; - static __device__ __host__ char3 all(char v) {return make_char3(v, v, v);} - static __device__ __host__ char3 make(char x, char y, char z) {return make_char3(x, y, z);} + static __device__ __host__ char3 all(schar v) {return make_char3(v, v, v);} + static __device__ __host__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);} }; template<> struct VecTraits { - typedef char elem_t; + typedef schar elem_t; enum {cn=4}; - static __device__ __host__ char4 all(char v) {return make_char4(v, v, v, v);} - static __device__ __host__ char4 make(char x, char y, char z, char w) {return make_char4(x, y, z, w);} + static __device__ __host__ char4 all(schar v) {return make_char4(v, v, v, v);} + static __device__ __host__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);} }; template<> struct VecTraits @@ -371,6 +391,42 @@ namespace cv static __device__ __host__ float4 make(float x, float y, float z, float w) {return make_float4(x, y, z, w);} }; + template<> struct VecTraits + { + typedef double elem_t; + enum {cn=1}; + static __device__ __host__ double all(double v) {return v;} + static __device__ __host__ double make(double x) {return x;} + }; + template<> struct VecTraits + { + typedef double elem_t; + enum {cn=1}; + static __device__ __host__ double1 all(double v) {return make_double1(v);} + static __device__ __host__ double1 make(double x) {return make_double1(x);} + }; + template<> struct VecTraits + { + typedef double elem_t; + enum {cn=2}; + static __device__ __host__ double2 all(double v) {return make_double2(v, v);} + static __device__ __host__ double2 make(double x, double y) {return make_double2(x, y);} + }; + template<> struct VecTraits + { + typedef double elem_t; + enum {cn=3}; + static __device__ __host__ double3 all(double v) {return make_double3(v, v, v);} + static __device__ __host__ double3 make(double x, double y, double z) {return make_double3(x, y, z);} + }; + template<> struct VecTraits + { + typedef double elem_t; + enum {cn=4}; + static __device__ __host__ double4 all(double v) {return make_double4(v, v, v, v);} + static __device__ __host__ double4 make(double x, double y, double z, double w) {return make_double4(x, y, z, w);} + }; + template struct SatCast; template struct SatCast<1, VecD> { diff --git a/modules/gpu/src/precomp.hpp b/modules/gpu/src/precomp.hpp index e0f2e004e..624e67fce 100644 --- a/modules/gpu/src/precomp.hpp +++ b/modules/gpu/src/precomp.hpp @@ -70,7 +70,6 @@ #include "opencv2/gpu/stream_accessor.hpp" #include "npp.h" #include "npp_staging.h" - #include "surf_key_point.h" #include "nvidia/NCV.hpp" #include "nvidia/NCVHaarObjectDetection.hpp" diff --git a/tests/gpu/src/imgproc_gpu.cpp b/tests/gpu/src/imgproc_gpu.cpp index 
7766d80da..d3affcf17 100644 --- a/tests/gpu/src/imgproc_gpu.cpp +++ b/tests/gpu/src/imgproc_gpu.cpp @@ -180,30 +180,41 @@ void CV_GpuImageProcTest::run( int ) //////////////////////////////////////////////////////////////////////////////// // threshold -struct CV_GpuNppImageThresholdTest : public CV_GpuImageProcTest +struct CV_GpuImageThresholdTest : public CV_GpuImageProcTest { public: - CV_GpuNppImageThresholdTest() : CV_GpuImageProcTest( "GPU-NppImageThreshold", "threshold" ) {} + CV_GpuImageThresholdTest() : CV_GpuImageProcTest( "GPU-ImageThreshold", "threshold" ) {} int test(const Mat& img) { - if (img.type() != CV_32FC1) + if (img.type() != CV_8UC1 && img.type() != CV_32FC1) { ts->printf(CvTS::LOG, "\nUnsupported type\n"); return CvTS::OK; } + const double maxVal = img.type() == CV_8UC1 ? 255 : 1.0; + cv::RNG rng(*ts->get_rng()); - const double thresh = rng; - cv::Mat cpuRes; - cv::threshold(img, cpuRes, thresh, 0.0, THRESH_TRUNC); + int res = CvTS::OK; - GpuMat gpu1(img); - GpuMat gpuRes; - cv::gpu::threshold(gpu1, gpuRes, thresh); + for (int type = THRESH_BINARY; type <= THRESH_TOZERO_INV; ++type) + { + const double thresh = rng.uniform(0.0, maxVal); - return CheckNorm(cpuRes, gpuRes); + cv::Mat cpuRes; + cv::threshold(img, cpuRes, thresh, maxVal, type); + + GpuMat gpu1(img); + GpuMat gpuRes; + cv::gpu::threshold(gpu1, gpuRes, thresh, maxVal, type); + + if (CheckNorm(cpuRes, gpuRes) != CvTS::OK) + res = CvTS::FAIL_GENERIC; + } + + return res; } }; @@ -822,7 +833,7 @@ struct CV_GpuColumnSumTest: CvTest // Placing all test definitions in one place // makes us know about what tests are commented. -CV_GpuNppImageThresholdTest CV_GpuNppImageThreshold_test; +CV_GpuImageThresholdTest CV_GpuImageThreshold_test; CV_GpuNppImageResizeTest CV_GpuNppImageResize_test; CV_GpuNppImageCopyMakeBorderTest CV_GpuNppImageCopyMakeBorder_test; CV_GpuNppImageWarpAffineTest CV_GpuNppImageWarpAffine_test;
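With this patch applied, cv::gpu::threshold grows from a THRESH_TRUNC-only wrapper over NPP for CV_32FC1 into all five OpenCV threshold types on any single-channel image of non-double depth, plus an asynchronous overload. A minimal usage sketch of the new API (not part of the patch; the image file names are placeholders):

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::Mat src = cv::imread("input.png", 0); // load as 8-bit, single channel
    cv::gpu::GpuMat d_src(src), d_dst;

    // Synchronous overload: binarize at 128, writing 255 above the threshold.
    cv::gpu::threshold(d_src, d_dst, 128.0, 255.0, cv::THRESH_BINARY);

    // Asynchronous overload: the same call queued on a stream.
    cv::gpu::Stream stream;
    cv::gpu::threshold(d_src, d_dst, 128.0, 255.0, cv::THRESH_TOZERO, stream);
    stream.waitForCompletion();

    cv::Mat dst;
    d_dst.download(dst);
    cv::imwrite("output.png", dst);
    return 0;
}

Note the host-side threshold_caller: double input is rejected by CV_Assert, and thresh/maxVal are floored/rounded for integer depths, matching the CPU cv::threshold semantics.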