Merge branch '2.4'

Andrey Kamaev
2013-03-21 20:59:18 +04:00
276 changed files with 11834 additions and 5170 deletions

View File

@@ -318,40 +318,14 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
{
class LevelsInit
{
public:
Npp32s pLevels[256];
const Npp32s* pLevels3[3];
int nValues3[3];
const int cn = src.channels();
#if (CUDA_VERSION > 4020)
GpuMat d_pLevels;
#endif
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
CV_Assert( lut.depth() == CV_8U );
CV_Assert( lut.channels() == 1 || lut.channels() == cn );
CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() );
LevelsInit()
{
nValues3[0] = nValues3[1] = nValues3[2] = 256;
for (int i = 0; i < 256; ++i)
pLevels[i] = i;
#if (CUDA_VERSION <= 4020)
pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
#else
d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
#endif
}
};
static LevelsInit lvls;
int cn = src.channels();
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3);
CV_Assert(lut.depth() == CV_8U && (lut.channels() == 1 || lut.channels() == cn) && lut.rows * lut.cols == 256 && lut.isContinuous());
dst.create(src.size(), CV_MAKETYPE(lut.depth(), cn));
dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn));
NppiSize sz;
sz.height = src.rows;
@@ -360,19 +334,34 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
Mat nppLut;
lut.convertTo(nppLut, CV_32S);
cudaStream_t stream = StreamAccessor::getStream(s);
int nValues3[] = {256, 256, 256};
Npp32s pLevels[256];
for (int i = 0; i < 256; ++i)
pLevels[i] = i;
const Npp32s* pLevels3[3];
#if (CUDA_VERSION <= 4020)
pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
#else
GpuMat d_pLevels;
d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
#endif
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
if (src.type() == CV_8UC1)
{
#if (CUDA_VERSION <= 4020)
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
#else
GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), lvls.d_pLevels.ptr<Npp32s>(), 256) );
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
#endif
}
else
@@ -409,7 +398,7 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
}
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, lvls.pLevels3, lvls.nValues3) );
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
}
if (stream == 0)
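Since pLevels is the identity ramp (pLevels[i] == i) over all 256 values, NPP's linear LUT degenerates to a plain per-pixel table lookup. A minimal CPU reference of that behavior (a sketch; lutReference is a hypothetical helper, not part of this patch):
// With levels[i] == i and nValues == 256, linear interpolation reduces to
// dst[i] = values[src[i]] for 8-bit sources.
static void lutReference(const unsigned char* src, unsigned char* dst,
                         int count, const int* values)
{
    for (int i = 0; i < count; ++i)
        dst[i] = (unsigned char) values[src[i]];
}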

View File

@@ -1,137 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
cv::gpu::VIBE_GPU::VIBE_GPU(unsigned long) { throw_nogpu(); }
void cv::gpu::VIBE_GPU::initialize(const GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::VIBE_GPU::operator()(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::VIBE_GPU::release() {}
#else
namespace cv { namespace gpu { namespace device
{
namespace vibe
{
void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor);
void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
}
}}}
namespace
{
const int defaultNbSamples = 20;
const int defaultReqMatches = 2;
const int defaultRadius = 20;
const int defaultSubsamplingFactor = 16;
}
cv::gpu::VIBE_GPU::VIBE_GPU(unsigned long rngSeed) :
frameSize_(0, 0), rngSeed_(rngSeed)
{
nbSamples = defaultNbSamples;
reqMatches = defaultReqMatches;
radius = defaultRadius;
subsamplingFactor = defaultSubsamplingFactor;
}
void cv::gpu::VIBE_GPU::initialize(const GpuMat& firstFrame, Stream& s)
{
using namespace cv::gpu::device::vibe;
CV_Assert(firstFrame.type() == CV_8UC1 || firstFrame.type() == CV_8UC3 || firstFrame.type() == CV_8UC4);
cudaStream_t stream = StreamAccessor::getStream(s);
loadConstants(nbSamples, reqMatches, radius, subsamplingFactor);
frameSize_ = firstFrame.size();
if (randStates_.size() != frameSize_)
{
cv::RNG rng(rngSeed_);
cv::Mat h_randStates(frameSize_, CV_8UC4);
rng.fill(h_randStates, cv::RNG::UNIFORM, 0, 255);
randStates_.upload(h_randStates);
}
int ch = firstFrame.channels();
int sample_ch = ch == 1 ? 1 : 4;
samples_.create(nbSamples * frameSize_.height, frameSize_.width, CV_8UC(sample_ch));
init_gpu(firstFrame, ch, samples_, randStates_, stream);
}
void cv::gpu::VIBE_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, Stream& s)
{
using namespace cv::gpu::device::vibe;
CV_Assert(frame.depth() == CV_8U);
int ch = frame.channels();
int sample_ch = ch == 1 ? 1 : 4;
if (frame.size() != frameSize_ || sample_ch != samples_.channels())
initialize(frame);
fgmask.create(frameSize_, CV_8UC1);
update_gpu(frame, ch, fgmask, samples_, randStates_, StreamAccessor::getStream(s));
}
void cv::gpu::VIBE_GPU::release()
{
frameSize_ = Size(0, 0);
randStates_.release();
samples_.release();
}
#endif
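For context, a hypothetical usage sketch of the wrapper deleted here (names taken from the removed file; operator() re-initializes the model lazily when the frame size or channel layout changes):
// cv::gpu::VIBE_GPU vibe(1234567);   // constructor takes the RNG seed
// cv::gpu::GpuMat d_frame, d_fgmask;
// d_frame.upload(frame);             // CV_8UC1, CV_8UC3 or CV_8UC4 input
// vibe(d_frame, d_fgmask);           // d_fgmask: CV_8UC1, 255 = foreground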

View File

@@ -48,6 +48,7 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::demosaicing(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::swapChannels(GpuMat&, const int[], Stream&) { throw_nogpu(); }
void cv::gpu::gammaCorrection(const GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); }
@@ -62,6 +63,9 @@ namespace cv { namespace gpu {
void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template <int cn>
void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template <int cn>
void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
}
}}
@@ -1620,26 +1624,56 @@ namespace
funcs[src.depth()][dcn - 1](src, dst, blue_last, start_with_green, StreamAccessor::getStream(stream));
}
void bayerBG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
bayer_to_bgr(src, dst, dcn, false, false, stream);
}
void bayerGB_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
bayer_to_bgr(src, dst, dcn, false, true, stream);
}
void bayerRG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
bayer_to_bgr(src, dst, dcn, true, false, stream);
}
void bayerGR_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
bayer_to_bgr(src, dst, dcn, true, true, stream);
}
void bayer_to_gray(const GpuMat& src, GpuMat& dst, bool blue_last, bool start_with_green, Stream& stream)
{
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
static const func_t funcs[3] =
{
Bayer2BGR_8u_gpu<1>,
0,
Bayer2BGR_16u_gpu<1>,
};
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1);
CV_Assert(src.rows > 2 && src.cols > 2);
dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
funcs[src.depth()](src, dst, blue_last, start_with_green, StreamAccessor::getStream(stream));
}
void bayerBG_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
{
bayer_to_gray(src, dst, false, false, stream);
}
void bayerGB_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
{
bayer_to_gray(src, dst, false, true, stream);
}
void bayerRG_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
{
bayer_to_gray(src, dst, true, false, stream);
}
void bayerGR_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
{
bayer_to_gray(src, dst, true, true, stream);
}
}
void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)
@@ -1756,10 +1790,10 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream
yuv_to_bgr, // CV_YUV2BGR = 84
yuv_to_rgb, // CV_YUV2RGB = 85
0, // CV_BayerBG2GRAY = 86
0, // CV_BayerGB2GRAY = 87
0, // CV_BayerRG2GRAY = 88
0, // CV_BayerGR2GRAY = 89
bayerBG_to_gray, // CV_BayerBG2GRAY = 86
bayerGB_to_gray, // CV_BayerGB2GRAY = 87
bayerRG_to_gray, // CV_BayerRG2GRAY = 88
bayerGR_to_gray, // CV_BayerGR2GRAY = 89
//YUV 4:2:0 formats family
0, // CV_YUV2RGB_NV12 = 90,
@@ -1825,6 +1859,74 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream
func(src, dst, dcn, stream);
}
void cv::gpu::demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)
{
const int depth = src.depth();
CV_Assert( src.channels() == 1 );
switch (code)
{
case CV_BayerBG2GRAY: case CV_BayerGB2GRAY: case CV_BayerRG2GRAY: case CV_BayerGR2GRAY:
bayer_to_gray(src, dst, code == CV_BayerBG2GRAY || code == CV_BayerGB2GRAY, code == CV_BayerGB2GRAY || code == CV_BayerGR2GRAY, stream);
break;
case CV_BayerBG2BGR: case CV_BayerGB2BGR: case CV_BayerRG2BGR: case CV_BayerGR2BGR:
bayer_to_bgr(src, dst, dcn, code == CV_BayerBG2BGR || code == CV_BayerGB2BGR, code == CV_BayerGB2BGR || code == CV_BayerGR2BGR, stream);
break;
case COLOR_BayerBG2BGR_MHT: case COLOR_BayerGB2BGR_MHT: case COLOR_BayerRG2BGR_MHT: case COLOR_BayerGR2BGR_MHT:
{
if (dcn <= 0)
dcn = 3;
CV_Assert( depth == CV_8U );
CV_Assert( dcn == 3 || dcn == 4 );
dst.create(src.size(), CV_MAKETYPE(depth, dcn));
dst.setTo(Scalar::all(0));
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);
const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1,
code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1);
if (dcn == 3)
device::MHCdemosaic<3>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
else
device::MHCdemosaic<4>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
break;
}
case COLOR_BayerBG2GRAY_MHT: case COLOR_BayerGB2GRAY_MHT: case COLOR_BayerRG2GRAY_MHT: case COLOR_BayerGR2GRAY_MHT:
{
CV_Assert( depth == CV_8U );
dst.create(src.size(), CV_MAKETYPE(depth, 1));
dst.setTo(Scalar::all(0));
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);
const int2 firstRed = make_int2(code == COLOR_BayerRG2GRAY_MHT || code == COLOR_BayerGB2GRAY_MHT ? 0 : 1,
code == COLOR_BayerRG2GRAY_MHT || code == COLOR_BayerGR2GRAY_MHT ? 0 : 1);
device::MHCdemosaic<1>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
break;
}
default:
CV_Error( CV_StsBadFlag, "Unknown / unsupported color conversion code" );
}
}
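A hypothetical call site for the extended entry point (a sketch, assuming the *_MHT codes are visible as used above):
// Malvar-He-Cutler demosaicing of an 8-bit Bayer mosaic to 3-channel BGR.
// cv::gpu::GpuMat d_bayer;           // CV_8UC1 Bayer input
// cv::gpu::GpuMat d_bgr;
// cv::gpu::demosaicing(d_bayer, d_bgr, COLOR_BayerBG2BGR_MHT, 3);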
void cv::gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& s)
{
CV_Assert(image.type() == CV_8UC4);

View File

@@ -1,258 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
namespace cv { namespace gpu { namespace device
{
namespace vibe
{
__constant__ int c_nbSamples;
__constant__ int c_reqMatches;
__constant__ int c_radius;
__constant__ int c_subsamplingFactor;
void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor)
{
cudaSafeCall( cudaMemcpyToSymbol(c_nbSamples, &nbSamples, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_reqMatches, &reqMatches, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_radius, &radius, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_subsamplingFactor, &subsamplingFactor, sizeof(int)) );
}
__device__ __forceinline__ uint nextRand(uint& state)
{
const unsigned int CV_RNG_COEFF = 4164903690U;
state = state * CV_RNG_COEFF + (state >> 16);
return state;
}
__constant__ int c_xoff[9] = {-1, 0, 1, -1, 1, -1, 0, 1, 0};
__constant__ int c_yoff[9] = {-1, -1, -1, 0, 0, 1, 1, 1, 0};
__device__ __forceinline__ int2 chooseRandomNeighbor(int x, int y, uint& randState, int count = 8)
{
int idx = nextRand(randState) % count;
return make_int2(x + c_xoff[idx], y + c_yoff[idx]);
}
__device__ __forceinline__ uchar cvt(uchar val)
{
return val;
}
__device__ __forceinline__ uchar4 cvt(const uchar3& val)
{
return make_uchar4(val.x, val.y, val.z, 0);
}
__device__ __forceinline__ uchar4 cvt(const uchar4& val)
{
return val;
}
template <typename SrcT, typename SampleT>
__global__ void init(const PtrStepSz<SrcT> frame, PtrStep<SampleT> samples, PtrStep<uint> randStates)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= frame.cols || y >= frame.rows)
return;
uint localState = randStates(y, x);
for (int k = 0; k < c_nbSamples; ++k)
{
int2 np = chooseRandomNeighbor(x, y, localState, 9);
np.x = ::max(0, ::min(np.x, frame.cols - 1));
np.y = ::max(0, ::min(np.y, frame.rows - 1));
SrcT pix = frame(np.y, np.x);
samples(k * frame.rows + y, x) = cvt(pix);
}
randStates(y, x) = localState;
}
template <typename SrcT, typename SampleT>
void init_caller(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(init<SrcT, SampleT>, cudaFuncCachePreferL1) );
init<SrcT, SampleT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, (PtrStepSz<SampleT>) samples, randStates);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);
static const func_t funcs[] =
{
0, init_caller<uchar, uchar>, 0, init_caller<uchar3, uchar4>, init_caller<uchar4, uchar4>
};
funcs[cn](frame, samples, randStates, stream);
}
__device__ __forceinline__ int calcDist(uchar a, uchar b)
{
return ::abs(a - b);
}
__device__ __forceinline__ int calcDist(const uchar3& a, const uchar4& b)
{
return (::abs(a.x - b.x) + ::abs(a.y - b.y) + ::abs(a.z - b.z)) / 3;
}
__device__ __forceinline__ int calcDist(const uchar4& a, const uchar4& b)
{
return (::abs(a.x - b.x) + ::abs(a.y - b.y) + ::abs(a.z - b.z)) / 3;
}
template <typename SrcT, typename SampleT>
__global__ void update(const PtrStepSz<SrcT> frame, PtrStepb fgmask, PtrStep<SampleT> samples, PtrStep<uint> randStates)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= frame.cols || y >= frame.rows)
return;
uint localState = randStates(y, x);
SrcT imgPix = frame(y, x);
// comparison with the model
int count = 0;
for (int k = 0; (count < c_reqMatches) && (k < c_nbSamples); ++k)
{
SampleT samplePix = samples(k * frame.rows + y, x);
int distance = calcDist(imgPix, samplePix);
if (distance < c_radius)
++count;
}
// pixel classification according to reqMatches
fgmask(y, x) = (uchar) (-(count < c_reqMatches));
if (count >= c_reqMatches)
{
// the pixel belongs to the background
// gets a random number between 0 and subsamplingFactor-1
int randomNumber = nextRand(localState) % c_subsamplingFactor;
// update of the current pixel model
if (randomNumber == 0)
{
// random subsampling
int k = nextRand(localState) % c_nbSamples;
samples(k * frame.rows + y, x) = cvt(imgPix);
}
// update of a neighboring pixel model
randomNumber = nextRand(localState) % c_subsamplingFactor;
if (randomNumber == 0)
{
// random subsampling
// chooses a neighboring pixel randomly
int2 np = chooseRandomNeighbor(x, y, localState);
np.x = ::max(0, ::min(np.x, frame.cols - 1));
np.y = ::max(0, ::min(np.y, frame.rows - 1));
// chooses the value to be replaced randomly
int k = nextRand(localState) % c_nbSamples;
samples(k * frame.rows + np.y, np.x) = cvt(imgPix);
}
}
randStates(y, x) = localState;
}
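Restating the classification rule above as a CPU reference for the grayscale case (a sketch; classifyPixel is illustrative only):
#include <cstdlib>
// A pixel is background if at least reqMatches of its nbSamples model values
// lie within radius of the current intensity; otherwise it is foreground.
static unsigned char classifyPixel(unsigned char pix, const unsigned char* samples,
                                   int nbSamples, int reqMatches, int radius)
{
    int count = 0;
    for (int k = 0; k < nbSamples && count < reqMatches; ++k)
        if (std::abs(samples[k] - pix) < radius)
            ++count;
    return (unsigned char) (count < reqMatches ? 255 : 0);
}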
template <typename SrcT, typename SampleT>
void update_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(update<SrcT, SampleT>, cudaFuncCachePreferL1) );
update<SrcT, SampleT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, (PtrStepSz<SampleT>) samples, randStates);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);
static const func_t funcs[] =
{
0, update_caller<uchar, uchar>, 0, update_caller<uchar3, uchar4>, update_caller<uchar4, uchar4>
};
funcs[cn](frame, fgmask, samples, randStates, stream);
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -42,42 +42,38 @@
#if !defined CUDA_DISABLER
#include <opencv2/gpu/device/common.hpp>
#include <opencv2/gpu/device/vec_traits.hpp>
#include <opencv2/gpu/device/vec_math.hpp>
#include <opencv2/gpu/device/limits.hpp>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/color.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
namespace cv { namespace gpu {
namespace device
namespace cv { namespace gpu { namespace device
{
template <typename T> struct Bayer2BGR;
template <> struct Bayer2BGR<uchar>
{
template <typename D>
__global__ void Bayer2BGR_8u(const PtrStepb src, PtrStepSz<D> dst, const bool blue_last, const bool start_with_green)
uchar3 res0;
uchar3 res1;
uchar3 res2;
uchar3 res3;
__device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= dst.rows || (s_x << 2) >= dst.cols)
return;
s_y = ::min(::max(s_y, 1), dst.rows - 2);
uchar4 patch[3][3];
patch[0][1] = ((const uchar4*) src.ptr(s_y - 1))[s_x];
patch[0][0] = ((const uchar4*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)];
patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
patch[1][1] = ((const uchar4*) src.ptr(s_y))[s_x];
patch[1][0] = ((const uchar4*) src.ptr(s_y))[::max(s_x - 1, 0)];
patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)];
patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
patch[2][1] = ((const uchar4*) src.ptr(s_y + 1))[s_x];
patch[2][0] = ((const uchar4*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)];
D res0 = VecTraits<D>::all(numeric_limits<uchar>::max());
D res1 = VecTraits<D>::all(numeric_limits<uchar>::max());
D res2 = VecTraits<D>::all(numeric_limits<uchar>::max());
D res3 = VecTraits<D>::all(numeric_limits<uchar>::max());
patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
if ((s_y & 1) ^ start_with_green)
{
@@ -181,45 +177,69 @@ namespace cv { namespace gpu {
res3.z = t7;
}
}
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = res0;
if (d_x + 1 < dst.cols)
dst(d_y, d_x + 1) = res1;
if (d_x + 2 < dst.cols)
dst(d_y, d_x + 2) = res2;
if (d_x + 3 < dst.cols)
dst(d_y, d_x + 3) = res3;
}
};
template <typename D>
__global__ void Bayer2BGR_16u(const PtrStepb src, PtrStepSz<D> dst, const bool blue_last, const bool start_with_green)
template <typename D> __device__ __forceinline__ D toDst(const uchar3& pix);
template <> __device__ __forceinline__ uchar toDst<uchar>(const uchar3& pix)
{
typename bgr_to_gray_traits<uchar>::functor_type f = bgr_to_gray_traits<uchar>::create_functor();
return f(pix);
}
template <> __device__ __forceinline__ uchar3 toDst<uchar3>(const uchar3& pix)
{
return pix;
}
template <> __device__ __forceinline__ uchar4 toDst<uchar4>(const uchar3& pix)
{
return make_uchar4(pix.x, pix.y, pix.z, 255);
}
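toDst<uchar> above routes the reconstructed BGR triple through the stock gray functor. As a reference, assuming OpenCV's usual BT.601 weights (grayRef is a hypothetical illustration, not part of the patch):
// gray = 0.114*B + 0.587*G + 0.299*R (assumed weights), rounded to nearest.
__device__ __forceinline__ uchar grayRef(const uchar3& bgr)
{
    return (uchar) (0.114f * bgr.x + 0.587f * bgr.y + 0.299f * bgr.z + 0.5f);
}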
template <typename D>
__global__ void Bayer2BGR_8u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= src.rows || (s_x << 2) >= src.cols)
return;
s_y = ::min(::max(s_y, 1), src.rows - 2);
Bayer2BGR<uchar> bayer;
bayer.apply(src, s_x, s_y, blue_last, start_with_green);
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = toDst<D>(bayer.res0);
if (d_x + 1 < src.cols)
dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
if (d_x + 2 < src.cols)
dst(d_y, d_x + 2) = toDst<D>(bayer.res2);
if (d_x + 3 < src.cols)
dst(d_y, d_x + 3) = toDst<D>(bayer.res3);
}
template <> struct Bayer2BGR<ushort>
{
ushort3 res0;
ushort3 res1;
__device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= dst.rows || (s_x << 1) >= dst.cols)
return;
s_y = ::min(::max(s_y, 1), dst.rows - 2);
ushort2 patch[3][3];
patch[0][1] = ((const ushort2*) src.ptr(s_y - 1))[s_x];
patch[0][0] = ((const ushort2*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)];
patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
patch[1][1] = ((const ushort2*) src.ptr(s_y))[s_x];
patch[1][0] = ((const ushort2*) src.ptr(s_y))[::max(s_x - 1, 0)];
patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)];
patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
patch[2][1] = ((const ushort2*) src.ptr(s_y + 1))[s_x];
patch[2][0] = ((const ushort2*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)];
D res0 = VecTraits<D>::all(numeric_limits<ushort>::max());
D res1 = VecTraits<D>::all(numeric_limits<ushort>::max());
patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
if ((s_y & 1) ^ start_with_green)
{
@@ -279,53 +299,246 @@ namespace cv { namespace gpu {
res1.z = t3;
}
}
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = res0;
if (d_x + 1 < dst.cols)
dst(d_y, d_x + 1) = res1;
}
};
template <int cn>
void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, 4 * block.x), divUp(dst.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int cn>
void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<ushort, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, 2 * block.x), divUp(dst.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template <typename D> __device__ __forceinline__ D toDst(const ushort3& pix);
template <> __device__ __forceinline__ ushort toDst<ushort>(const ushort3& pix)
{
typename bgr_to_gray_traits<ushort>::functor_type f = bgr_to_gray_traits<ushort>::create_functor();
return f(pix);
}
template <> __device__ __forceinline__ ushort3 toDst<ushort3>(const ushort3& pix)
{
return pix;
}
template <> __device__ __forceinline__ ushort4 toDst<ushort4>(const ushort3& pix)
{
return make_ushort4(pix.x, pix.y, pix.z, numeric_limits<ushort>::max());
}
}}
#endif /* CUDA_DISABLER */
template <typename D>
__global__ void Bayer2BGR_16u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= src.rows || (s_x << 1) >= src.cols)
return;
s_y = ::min(::max(s_y, 1), src.rows - 2);
Bayer2BGR<ushort> bayer;
bayer.apply(src, s_x, s_y, blue_last, start_with_green);
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = toDst<D>(bayer.res0);
if (d_x + 1 < src.cols)
dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
}
template <int cn>
void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, 4 * block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int cn>
void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<ushort, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, 2 * block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void Bayer2BGR_8u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
//////////////////////////////////////////////////////////////
// Bayer Demosaicing (Malvar, He, and Cutler)
//
// by Morgan McGuire, Williams College
// http://graphics.cs.williams.edu/papers/BayerJGT09/#shaders
//
// ported to CUDA
texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);
template <typename DstType>
__global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
{
const float kAx = -1.0f / 8.0f, kAy = -1.5f / 8.0f, kAz = 0.5f / 8.0f /*kAw = -1.0f / 8.0f*/;
const float kBx = 2.0f / 8.0f, /*kBy = 0.0f / 8.0f,*/ /*kBz = 0.0f / 8.0f,*/ kBw = 4.0f / 8.0f ;
const float kCx = 4.0f / 8.0f, kCy = 6.0f / 8.0f, kCz = 5.0f / 8.0f /*kCw = 5.0f / 8.0f*/;
const float /*kDx = 0.0f / 8.0f,*/ kDy = 2.0f / 8.0f, kDz = -1.0f / 8.0f /*kDw = -1.0f / 8.0f*/;
const float kEx = -1.0f / 8.0f, kEy = -1.5f / 8.0f, /*kEz = -1.0f / 8.0f,*/ kEw = 0.5f / 8.0f ;
const float kFx = 2.0f / 8.0f, /*kFy = 0.0f / 8.0f,*/ kFz = 4.0f / 8.0f /*kFw = 0.0f / 8.0f*/;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x == 0 || x >= dst.cols - 1 || y == 0 || y >= dst.rows - 1)
return;
int2 center;
center.x = x + sourceOffset.x;
center.y = y + sourceOffset.y;
int4 xCoord;
xCoord.x = center.x - 2;
xCoord.y = center.x - 1;
xCoord.z = center.x + 1;
xCoord.w = center.x + 2;
int4 yCoord;
yCoord.x = center.y - 2;
yCoord.y = center.y - 1;
yCoord.z = center.y + 1;
yCoord.w = center.y + 2;
float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
float4 Dvec;
Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
float4 value;
value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
// (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
float4 PATTERN;
PATTERN.x = kCx * C;
PATTERN.y = kCy * C;
PATTERN.z = kCz * C;
PATTERN.w = PATTERN.z;
float D = Dvec.x + Dvec.y + Dvec.z + Dvec.w;
// There are five filter patterns (identity, cross, checker,
// theta, phi). Precompute the terms from all of them and then
// use swizzles to assign to color channels.
//
// Channel Matches
// x cross (e.g., EE G)
// y checker (e.g., EE B)
// z theta (e.g., EO R)
// w phi (e.g., EO B)
#define A value.x // A0 + A1
#define B value.y // B0 + B1
#define E value.z // E0 + E1
#define F value.w // F0 + F1
float3 temp;
// PATTERN.yzw += (kD.yz * D).xyy;
temp.x = kDy * D;
temp.y = kDz * D;
PATTERN.y += temp.x;
PATTERN.z += temp.y;
PATTERN.w += temp.y;
// PATTERN += (kA.xyz * A).xyzx;
temp.x = kAx * A;
temp.y = kAy * A;
temp.z = kAz * A;
PATTERN.x += temp.x;
PATTERN.y += temp.y;
PATTERN.z += temp.z;
PATTERN.w += temp.x;
// PATTERN += (kE.xyw * E).xyxz;
temp.x = kEx * E;
temp.y = kEy * E;
temp.z = kEw * E;
PATTERN.x += temp.x;
PATTERN.y += temp.y;
PATTERN.z += temp.x;
PATTERN.w += temp.z;
// PATTERN.xw += kB.xw * B;
PATTERN.x += kBx * B;
PATTERN.w += kBw * B;
// PATTERN.xz += kF.xz * F;
PATTERN.x += kFx * F;
PATTERN.z += kFz * F;
// Determine which of four types of pixels we are on.
int2 alternate;
alternate.x = (x + firstRed.x) % 2;
alternate.y = (y + firstRed.y) % 2;
// in BGR sequence;
uchar3 pixelColor =
(alternate.y == 0) ?
((alternate.x == 0) ?
make_uchar3(saturate_cast<uchar>(PATTERN.y), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(C)) :
make_uchar3(saturate_cast<uchar>(PATTERN.w), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.z))) :
((alternate.x == 0) ?
make_uchar3(saturate_cast<uchar>(PATTERN.z), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.w)) :
make_uchar3(saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(PATTERN.y)));
dst(y, x) = toDst<DstType>(pixelColor);
}
template <int cn>
void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream)
{
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
bindTexture(&sourceTex, src);
MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void MHCdemosaic<1>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
template void MHCdemosaic<3>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
template void MHCdemosaic<4>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
}}}
#endif /* CUDA_DISABLER */

View File

@@ -48,6 +48,7 @@
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/simd_functions.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
@@ -154,170 +155,28 @@ namespace arithm
namespace arithm
{
template <typename T, typename D> struct VAdd4;
template <> struct VAdd4<uint, uint> : binary_function<uint, uint, uint>
struct VAdd4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vadd4(a, b);
}
__device__ __forceinline__ VAdd4() {}
__device__ __forceinline__ VAdd4(const VAdd4<uint, uint>& other) {}
};
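The hand-written PTX removed here now lives behind vadd4 from simd_functions.hpp. A portable reference of the assumed per-lane semantics (vadd4_ref is a hypothetical sketch: four saturating unsigned 8-bit adds packed into one 32-bit word):
__device__ __forceinline__ unsigned int vadd4_ref(unsigned int a, unsigned int b)
{
    unsigned int r = 0;
    for (int i = 0; i < 4; ++i)
    {
        unsigned int s = ((a >> (8 * i)) & 0xFFu) + ((b >> (8 * i)) & 0xFFu);
        if (s > 0xFFu) s = 0xFFu;   // saturate each byte lane independently
        r |= s << (8 * i);
    }
    return r;
}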
template <> struct VAdd4<int, uint> : binary_function<int, int, uint>
{
__device__ __forceinline__ uint operator ()(int a, int b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd4() {}
__device__ __forceinline__ VAdd4(const VAdd4<int, uint>& other) {}
};
template <> struct VAdd4<uint, int> : binary_function<uint, uint, int>
{
__device__ __forceinline__ int operator ()(uint a, uint b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd4() {}
__device__ __forceinline__ VAdd4(const VAdd4<uint, int>& other) {}
};
template <> struct VAdd4<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd4() {}
__device__ __forceinline__ VAdd4(const VAdd4<int, int>& other) {}
__device__ __forceinline__ VAdd4(const VAdd4& other) {}
};
////////////////////////////////////
template <typename T, typename D> struct VAdd2;
template <> struct VAdd2<uint, uint> : binary_function<uint, uint, uint>
struct VAdd2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vadd2(a, b);
}
__device__ __forceinline__ VAdd2() {}
__device__ __forceinline__ VAdd2(const VAdd2<uint, uint>& other) {}
};
template <> struct VAdd2<uint, int> : binary_function<uint, uint, int>
{
__device__ __forceinline__ int operator ()(uint a, uint b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd2() {}
__device__ __forceinline__ VAdd2(const VAdd2<uint, int>& other) {}
};
template <> struct VAdd2<int, uint> : binary_function<int, int, uint>
{
__device__ __forceinline__ uint operator ()(int a, int b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd2() {}
__device__ __forceinline__ VAdd2(const VAdd2<int, uint>& other) {}
};
template <> struct VAdd2<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd2() {}
__device__ __forceinline__ VAdd2(const VAdd2<int, int>& other) {}
__device__ __forceinline__ VAdd2(const VAdd2& other) {}
};
////////////////////////////////////
@@ -336,13 +195,13 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <typename T, typename D> struct TransformFunctorTraits< arithm::VAdd4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <typename T, typename D> struct TransformFunctorTraits< arithm::VAdd2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
@@ -355,28 +214,16 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
template <typename T, typename D>
void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd4<T, D>(), WithOutMask(), stream);
transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
}
template void vadd4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd2<T, D>(), WithOutMask(), stream);
transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
}
template void vadd2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
@@ -543,170 +390,28 @@ namespace arithm
namespace arithm
{
template <typename T, typename D> struct VSub4;
template <> struct VSub4<uint, uint> : binary_function<uint, uint, uint>
struct VSub4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vsub4(a, b);
}
__device__ __forceinline__ VSub4() {}
__device__ __forceinline__ VSub4(const VSub4<uint, uint>& other) {}
};
template <> struct VSub4<int, uint> : binary_function<int, int, uint>
{
__device__ __forceinline__ uint operator ()(int a, int b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub4() {}
__device__ __forceinline__ VSub4(const VSub4<int, uint>& other) {}
};
template <> struct VSub4<uint, int> : binary_function<uint, uint, int>
{
__device__ __forceinline__ int operator ()(uint a, uint b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub4() {}
__device__ __forceinline__ VSub4(const VSub4<uint, int>& other) {}
};
template <> struct VSub4<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub4() {}
__device__ __forceinline__ VSub4(const VSub4<int, int>& other) {}
__device__ __forceinline__ VSub4(const VSub4& other) {}
};
////////////////////////////////////
template <typename T, typename D> struct VSub2;
template <> struct VSub2<uint, uint> : binary_function<uint, uint, uint>
struct VSub2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vsub2(a, b);
}
__device__ __forceinline__ VSub2() {}
__device__ __forceinline__ VSub2(const VSub2<uint, uint>& other) {}
};
template <> struct VSub2<uint, int> : binary_function<uint, uint, int>
{
__device__ __forceinline__ int operator ()(uint a, uint b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub2() {}
__device__ __forceinline__ VSub2(const VSub2<uint, int>& other) {}
};
template <> struct VSub2<int, uint> : binary_function<int, int, uint>
{
__device__ __forceinline__ uint operator ()(int a, int b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub2() {}
__device__ __forceinline__ VSub2(const VSub2<int, uint>& other) {}
};
template <> struct VSub2<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub2() {}
__device__ __forceinline__ VSub2(const VSub2<int, int>& other) {}
__device__ __forceinline__ VSub2(const VSub2& other) {}
};
////////////////////////////////////
@@ -725,13 +430,13 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <typename T, typename D> struct TransformFunctorTraits< arithm::VSub4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <typename T, typename D> struct TransformFunctorTraits< arithm::VSub2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
@@ -744,28 +449,16 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
template <typename T, typename D>
void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void subMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub4<T, D>(), WithOutMask(), stream);
transform(src1, src2, dst, VSub4(), WithOutMask(), stream);
}
template void vsub4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void subMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub2<T, D>(), WithOutMask(), stream);
transform(src1, src2, dst, VSub2(), WithOutMask(), stream);
}
template void vsub2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
@@ -1496,90 +1189,28 @@ namespace arithm
namespace arithm
{
template <typename T, typename D> struct VAbsDiff4;
template <> struct VAbsDiff4<uint, uint> : binary_function<uint, uint, uint>
struct VAbsDiff4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vabsdiff4(a, b);
}
__device__ __forceinline__ VAbsDiff4() {}
__device__ __forceinline__ VAbsDiff4(const VAbsDiff4<uint, uint>& other) {}
};
template <> struct VAbsDiff4<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAbsDiff4() {}
__device__ __forceinline__ VAbsDiff4(const VAbsDiff4<int, int>& other) {}
__device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
};
////////////////////////////////////
template <typename T, typename D> struct VAbsDiff2;
template <> struct VAbsDiff2<uint, uint> : binary_function<uint, uint, uint>
struct VAbsDiff2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vabsdiff2(a, b);
}
__device__ __forceinline__ VAbsDiff2() {}
__device__ __forceinline__ VAbsDiff2(const VAbsDiff2<uint, uint>& other) {}
};
template <> struct VAbsDiff2<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAbsDiff2() {}
__device__ __forceinline__ VAbsDiff2(const VAbsDiff2<int, int>& other) {}
__device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
};
////////////////////////////////////
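// A minimal host-side reference for what vabsdiff4(a, b) computes, assuming
// the usual SIMD-in-a-word semantics: each of the four packed bytes of the
// result holds the absolute difference of the corresponding bytes of a and b
// (vabsdiff2 does the same per halfword). The name absdiff4_reference is
// illustrative, not part of the patch.
#include <cstdint>
#include <cstdlib>

static inline uint32_t absdiff4_reference(uint32_t a, uint32_t b)
{
    uint32_t res = 0;
    for (int i = 0; i < 4; ++i)
    {
        const int av = (a >> (8 * i)) & 0xFF;   // i-th byte of a
        const int bv = (b >> (8 * i)) & 0xFF;   // i-th byte of b
        res |= static_cast<uint32_t>(std::abs(av - bv)) << (8 * i);
    }
    return res;
}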
@@ -1611,13 +1242,13 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <typename T, typename D> struct TransformFunctorTraits< arithm::VAbsDiff4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <typename T, typename D> struct TransformFunctorTraits< arithm::VAbsDiff2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
@@ -1630,24 +1261,16 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
template <typename T>
void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff4<T, T>(), WithOutMask(), stream);
transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
}
template void vabsDiff4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vabsDiff4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T>
void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff2<T, T>(), WithOutMask(), stream);
transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
}
template void vabsDiff2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vabsDiff2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T>
void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
@@ -1877,6 +1500,49 @@ namespace arithm
namespace arithm
{
struct VCmpEq4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpeq4(a, b);
}
__device__ __forceinline__ VCmpEq4() {}
__device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
};
struct VCmpNe4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpne4(a, b);
}
__device__ __forceinline__ VCmpNe4() {}
__device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
};
struct VCmpLt4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmplt4(a, b);
}
__device__ __forceinline__ VCmpLt4() {}
__device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
};
struct VCmpLe4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmple4(a, b);
}
__device__ __forceinline__ VCmpLe4() {}
__device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
};
////////////////////////////////////
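// A host-side reference for the vcmpeq4-style comparisons above, under the
// assumption that each result byte is 0xFF where the predicate holds for the
// corresponding bytes of a and b, and 0x00 otherwise (the other predicates
// differ only in the comparison used). cmpeq4_reference is an illustrative
// name.
#include <cstdint>

static inline uint32_t cmpeq4_reference(uint32_t a, uint32_t b)
{
    uint32_t res = 0;
    for (int i = 0; i < 4; ++i)
        if (((a >> (8 * i)) & 0xFFu) == ((b >> (8 * i)) & 0xFFu))
            res |= 0xFFu << (8 * i);
    return res;
}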
template <class Op, typename T>
struct Cmp : binary_function<T, T, uchar>
{
@@ -1890,6 +1556,21 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <> struct TransformFunctorTraits< arithm::VCmpEq4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpNe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpLt4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpLe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <class Op, typename T> struct TransformFunctorTraits< arithm::Cmp<Op, T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
{
};
@@ -1897,6 +1578,23 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
}
void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
}
void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
}
void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
}
template <template <typename> class Op, typename T>
void cmpMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
@@ -2303,44 +2001,11 @@ namespace arithm
namespace arithm
{
template <typename T> struct VMin4;
template <> struct VMin4<uint> : binary_function<uint, uint, uint>
struct VMin4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VMin4() {}
__device__ __forceinline__ VMin4(const VMin4& other) {}
};
template <> struct VMin4<int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmin.s32.s32.s32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vmin4(a, b);
}
__device__ __forceinline__ VMin4() {}
@@ -2349,40 +2014,11 @@ namespace arithm
////////////////////////////////////
template <typename T> struct VMin2;
template <> struct VMin2<uint> : binary_function<uint, uint, uint>
struct VMin2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VMin2() {}
__device__ __forceinline__ VMin2(const VMin2& other) {}
};
template <> struct VMin2<int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin2.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmin.s32.s32.s32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.s32.s32.s32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vmin2(a, b);
}
__device__ __forceinline__ VMin2() {}
@@ -2392,13 +2028,13 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <typename T> struct TransformFunctorTraits< arithm::VMin4<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <> struct TransformFunctorTraits< arithm::VMin4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <typename T> struct TransformFunctorTraits< arithm::VMin2<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <> struct TransformFunctorTraits< arithm::VMin2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
@@ -2415,14 +2051,14 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void minMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMin4<T>(), WithOutMask(), stream);
transform(src1, src2, dst, VMin4(), WithOutMask(), stream);
}
template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void minMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMin2<T>(), WithOutMask(), stream);
transform(src1, src2, dst, VMin2(), WithOutMask(), stream);
}
template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
@@ -2430,12 +2066,6 @@ namespace arithm
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, minimum<T>(), WithOutMask(), stream);
}
template void vmin4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmin4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmin2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmin2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -2463,44 +2093,11 @@ namespace arithm
namespace arithm
{
template <typename T> struct VMax4;
template <> struct VMax4<uint> : binary_function<uint, uint, uint>
struct VMax4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VMax4() {}
__device__ __forceinline__ VMax4(const VMax4& other) {}
};
template <> struct VMax4<int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmax.s32.s32.s32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vmax4(a, b);
}
__device__ __forceinline__ VMax4() {}
@@ -2509,40 +2106,11 @@ namespace arithm
////////////////////////////////////
template <typename T> struct VMax2;
template <> struct VMax2<uint> : binary_function<uint, uint, uint>
struct VMax2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VMax2() {}
__device__ __forceinline__ VMax2(const VMax2& other) {}
};
template <> struct VMax2<int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax2.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmax.s32.s32.s32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.s32.s32.s32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vmax2(a, b);
}
__device__ __forceinline__ VMax2() {}
@@ -2552,13 +2120,13 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <typename T> struct TransformFunctorTraits< arithm::VMax4<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <> struct TransformFunctorTraits< arithm::VMax4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <typename T> struct TransformFunctorTraits< arithm::VMax2<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <> struct TransformFunctorTraits< arithm::VMax2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
@@ -2575,14 +2143,14 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void maxMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMax4<T>(), WithOutMask(), stream);
transform(src1, src2, dst, VMax4(), WithOutMask(), stream);
}
template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void maxMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMax2<T>(), WithOutMask(), stream);
transform(src1, src2, dst, VMax2(), WithOutMask(), stream);
}
template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
@@ -2590,12 +2158,6 @@ namespace arithm
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, maximum<T>(), WithOutMask(), stream);
}
template void vmax4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmax4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmax2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmax2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);

@@ -1,934 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
//
// The original code was written by Paul Furgale and Chi Hay Tong
// and later optimized and prepared for integration into OpenCV by Itseez.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/filters.hpp"
namespace cv { namespace gpu { namespace device
{
namespace surf
{
////////////////////////////////////////////////////////////////////////
// Global parameters
// The maximum number of features (before subpixel interpolation) that memory is reserved for.
__constant__ int c_max_candidates;
// The maximum number of features that memory is reserved for.
__constant__ int c_max_features;
// The image size.
__constant__ int c_img_rows;
__constant__ int c_img_cols;
// The number of layers.
__constant__ int c_nOctaveLayers;
// The hessian threshold.
__constant__ float c_hessianThreshold;
// The current octave.
__constant__ int c_octave;
// The current layer size.
__constant__ int c_layer_rows;
__constant__ int c_layer_cols;
void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold)
{
cudaSafeCall( cudaMemcpyToSymbol(c_max_candidates, &maxCandidates, sizeof(maxCandidates)) );
cudaSafeCall( cudaMemcpyToSymbol(c_max_features, &maxFeatures, sizeof(maxFeatures)) );
cudaSafeCall( cudaMemcpyToSymbol(c_img_rows, &img_rows, sizeof(img_rows)) );
cudaSafeCall( cudaMemcpyToSymbol(c_img_cols, &img_cols, sizeof(img_cols)) );
cudaSafeCall( cudaMemcpyToSymbol(c_nOctaveLayers, &nOctaveLayers, sizeof(nOctaveLayers)) );
cudaSafeCall( cudaMemcpyToSymbol(c_hessianThreshold, &hessianThreshold, sizeof(hessianThreshold)) );
}
void loadOctaveConstants(int octave, int layer_rows, int layer_cols)
{
cudaSafeCall( cudaMemcpyToSymbol(c_octave, &octave, sizeof(octave)) );
cudaSafeCall( cudaMemcpyToSymbol(c_layer_rows, &layer_rows, sizeof(layer_rows)) );
cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );
}
////////////////////////////////////////////////////////////////////////
// Integral image texture
texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
void bindImgTex(PtrStepSzb img)
{
bindTexture(&imgTex, img);
}
size_t bindSumTex(PtrStepSz<uint> sum)
{
size_t offset;
cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
cudaSafeCall( cudaBindTexture2D(&offset, sumTex, sum.data, desc_sum, sum.cols, sum.rows, sum.step));
return offset / sizeof(uint);
}
size_t bindMaskSumTex(PtrStepSz<uint> maskSum)
{
size_t offset;
cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
cudaSafeCall( cudaBindTexture2D(&offset, maskSumTex, maskSum.data, desc_sum, maskSum.cols, maskSum.rows, maskSum.step));
return offset / sizeof(uint);
}
template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)
{
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
typedef double real_t;
#else
typedef float real_t;
#endif
float ratio = (float)newSize / oldSize;
real_t d = 0;
#pragma unroll
for (int k = 0; k < N; ++k)
{
int dx1 = __float2int_rn(ratio * src[k][0]);
int dy1 = __float2int_rn(ratio * src[k][1]);
int dx2 = __float2int_rn(ratio * src[k][2]);
int dy2 = __float2int_rn(ratio * src[k][3]);
real_t t = 0;
t += tex2D(sumTex, x + dx1, y + dy1);
t -= tex2D(sumTex, x + dx1, y + dy2);
t -= tex2D(sumTex, x + dx2, y + dy1);
t += tex2D(sumTex, x + dx2, y + dy2);
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
return (float)d;
}
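// Background for the four tex2D reads above: they implement the standard
// integral-image box sum. A plain CPU equivalent, assuming S[y * stride + x]
// stores the sum of all pixels above and to the left of (x, y); boxSum is an
// illustrative name.
static inline float boxSum(const float* S, int stride,
                           int x1, int y1, int x2, int y2)
{
    // Sum over the rectangle with corners (x1, y1) and (x2, y2),
    // matching the +/-/-/+ pattern of the texture reads above.
    return S[y1 * stride + x1] - S[y2 * stride + x1]
         - S[y1 * stride + x2] + S[y2 * stride + x2];
}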
////////////////////////////////////////////////////////////////////////
// Hessian
__constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
__constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
__constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };
__host__ __device__ __forceinline__ int calcSize(int octave, int layer)
{
/* Wavelet size at first layer of first octave. */
const int HAAR_SIZE0 = 9;
/* Wavelet size increment between layers. This should be an even number,
such that the wavelet sizes in an octave are either all even or all odd.
This ensures that when looking for the neighbours of a sample, the layers
above and below are aligned correctly. */
const int HAAR_SIZE_INC = 6;
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
}
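// A quick illustration of the sizes calcSize() produces, derived directly
// from the formula above (easy to verify by hand):
// octave 0 -> 9, 15, 21, 27;  octave 1 -> 18, 30, 42, 54;  octave 2 -> 36, 60, 84, 108.
#include <cstdio>
int main()
{
    for (int octave = 0; octave < 3; ++octave)
        for (int layer = 0; layer < 4; ++layer)
            std::printf("octave %d, layer %d -> %d\n",
                        octave, layer, (9 + 6 * layer) << octave);
    return 0;
}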
__global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
{
// Determine the indices
const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
const int blockIdx_y = blockIdx.y % gridDim_y;
const int blockIdx_z = blockIdx.y / gridDim_y;
const int j = threadIdx.x + blockIdx.x * blockDim.x;
const int i = threadIdx.y + blockIdx_y * blockDim.y;
const int layer = blockIdx_z;
const int size = calcSize(c_octave, layer);
const int samples_i = 1 + ((c_img_rows - size) >> c_octave);
const int samples_j = 1 + ((c_img_cols - size) >> c_octave);
// Ignore pixels where some of the kernel is outside the image
const int margin = (size >> 1) >> c_octave;
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
{
const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave));
const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave));
const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave));
det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
}
}
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
int octave, int nOctaveLayers)
{
const int min_size = calcSize(octave, 0);
const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
dim3 threads(16, 16);
dim3 grid;
grid.x = divUp(max_samples_j, threads.x);
grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// NONMAX
__constant__ float c_DM[5] = {0, 0, 9, 9, 1};
struct WithMask
{
static __device__ bool check(int sum_i, int sum_j, int size)
{
float ratio = (float)size / 9.0f;
float d = 0;
int dx1 = __float2int_rn(ratio * c_DM[0]);
int dy1 = __float2int_rn(ratio * c_DM[1]);
int dx2 = __float2int_rn(ratio * c_DM[2]);
int dy2 = __float2int_rn(ratio * c_DM[3]);
float t = 0;
t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
return (d >= 0.5f);
}
};
template <typename Mask>
__global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
unsigned int* maxCounter)
{
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
extern __shared__ float N9[];
// The indices below address the hessian (det) buffer.
const int gridDim_y = gridDim.y / c_nOctaveLayers;
const int blockIdx_y = blockIdx.y % gridDim_y;
const int blockIdx_z = blockIdx.y / gridDim_y;
const int layer = blockIdx_z + 1;
const int size = calcSize(c_octave, layer);
// Ignore pixels without a 3x3x3 neighbourhood in the layer above
const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;
const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;
// Is this thread within the hessian buffer?
const int zoff = blockDim.x * blockDim.y;
const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;
N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
N9[localLin ] = det.ptr(c_layer_rows * (layer ) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
__syncthreads();
if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)
{
float val0 = N9[localLin];
if (val0 > c_hessianThreshold)
{
// Coordinates for the start of the wavelet in the sum image. There
// is some integer division involved, so don't try to simplify this
// (cancel out sampleStep) without checking the result is the same
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
if (Mask::check(sum_i, sum_j, size))
{
// Check to see if we have a max (in its 26 neighbours)
const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
&& val0 > N9[localLin - blockDim.x - zoff]
&& val0 > N9[localLin + 1 - blockDim.x - zoff]
&& val0 > N9[localLin - 1 - zoff]
&& val0 > N9[localLin - zoff]
&& val0 > N9[localLin + 1 - zoff]
&& val0 > N9[localLin - 1 + blockDim.x - zoff]
&& val0 > N9[localLin + blockDim.x - zoff]
&& val0 > N9[localLin + 1 + blockDim.x - zoff]
&& val0 > N9[localLin - 1 - blockDim.x]
&& val0 > N9[localLin - blockDim.x]
&& val0 > N9[localLin + 1 - blockDim.x]
&& val0 > N9[localLin - 1 ]
&& val0 > N9[localLin + 1 ]
&& val0 > N9[localLin - 1 + blockDim.x]
&& val0 > N9[localLin + blockDim.x]
&& val0 > N9[localLin + 1 + blockDim.x]
&& val0 > N9[localLin - 1 - blockDim.x + zoff]
&& val0 > N9[localLin - blockDim.x + zoff]
&& val0 > N9[localLin + 1 - blockDim.x + zoff]
&& val0 > N9[localLin - 1 + zoff]
&& val0 > N9[localLin + zoff]
&& val0 > N9[localLin + 1 + zoff]
&& val0 > N9[localLin - 1 + blockDim.x + zoff]
&& val0 > N9[localLin + blockDim.x + zoff]
&& val0 > N9[localLin + 1 + blockDim.x + zoff]
;
if(condmax)
{
unsigned int ind = atomicInc(maxCounter, (unsigned int)-1);
if (ind < c_max_candidates)
{
const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);
maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);
}
}
}
}
}
#endif
}
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
{
const int layer_rows = img_rows >> octave;
const int layer_cols = img_cols >> octave;
const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
dim3 threads(16, 16);
dim3 grid;
grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);
grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;
const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
if (use_mask)
icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
else
icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// INTERPOLATION
__global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
unsigned int* featureCounter)
{
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
const int4 maxPos = maxPosBuffer[blockIdx.x];
const int j = maxPos.x - 1 + threadIdx.x;
const int i = maxPos.y - 1 + threadIdx.y;
const int layer = maxPos.z - 1 + threadIdx.z;
__shared__ float N9[3][3][3];
N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
{
__shared__ float dD[3];
//dx
dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
//dy
dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);
//ds
dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
__shared__ float H[3][3];
//dxx
H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
//dxy
H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
//dxs
H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
//dyx = dxy
H[1][0] = H[0][1];
//dyy
H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
//dys
H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
//dsx = dxs
H[2][0] = H[0][2];
//dsy = dys
H[2][1] = H[1][2];
//dss
H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
__shared__ float x[3];
if (solve3x3(H, dD, x))
{
if (::fabs(x[0]) <= 1.f && ::fabs(x[1]) <= 1.f && ::fabs(x[2]) <= 1.f)
{
// if the step is within the interpolation region, perform it
const int size = calcSize(c_octave, maxPos.z);
const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
const float center_i = sum_i + (float)(size - 1) / 2;
const float center_j = sum_j + (float)(size - 1) / 2;
const float px = center_j + x[0] * (1 << c_octave);
const float py = center_i + x[1] * (1 << c_octave);
const int ds = size - calcSize(c_octave, maxPos.z - 1);
const float psize = roundf(size + x[2] * ds);
/* The sampling intervals and wavelet sizes for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
const float s = psize * 1.2f / 9.0f;
/* To find the dominant orientation, the gradients in x and y are
sampled in a circle of radius 6s using wavelets of size 4s.
The gradient wavelet size is kept even so that the wavelet
pattern is balanced and symmetric around its center */
const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
// proceed only if grad_wav_size fits within the image
if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
{
// Get a new feature index.
unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);
if (ind < c_max_features)
{
featureX[ind] = px;
featureY[ind] = py;
featureLaplacian[ind] = maxPos.w;
featureOctave[ind] = c_octave;
featureSize[ind] = psize;
featureHessian[ind] = N9[1][1][1];
}
} // grad_wav_size check
} // If the subpixel interpolation worked
}
} // If this is thread 0.
#endif
}
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
unsigned int* featureCounter)
{
dim3 threads;
threads.x = 3;
threads.y = 3;
threads.z = 3;
dim3 grid;
grid.x = maxCounter;
icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureOctave, featureSize, featureHessian, featureCounter);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// Orientation
#define ORI_SEARCH_INC 5
#define ORI_WIN 60
#define ORI_SAMPLES 113
__constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
__constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
__constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};
__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
__global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
{
__shared__ float s_X[128];
__shared__ float s_Y[128];
__shared__ float s_angle[128];
__shared__ float s_sumx[32 * 4];
__shared__ float s_sumy[32 * 4];
/* The sampling intervals and wavelet sizes for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;
/* To find the dominant orientation, the gradients in x and y are
sampled in a circle of radius 6s using wavelets of size 4s.
The gradient wavelet size is kept even so that the wavelet
pattern is balanced and symmetric around its center */
const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
// bail out if grad_wav_size does not fit within the image
if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
return;
// Calc X, Y, angle and store it to shared memory
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
float X = 0.0f, Y = 0.0f, angle = 0.0f;
if (tid < ORI_SAMPLES)
{
const float margin = (float)(grad_wav_size - 1) / 2.0f;
const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);
const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);
if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
{
X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);
angle = atan2f(Y, X);
if (angle < 0)
angle += 2.0f * CV_PI_F;
angle *= 180.0f / CV_PI_F;
}
}
s_X[tid] = X;
s_Y[tid] = Y;
s_angle[tid] = angle;
__syncthreads();
float bestx = 0, besty = 0, best_mod = 0;
#if __CUDA_ARCH__ >= 200
#pragma unroll
#endif
for (int i = 0; i < 18; ++i)
{
const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
float sumx = 0.0f, sumy = 0.0f;
int d = ::abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx = s_X[threadIdx.x];
sumy = s_Y[threadIdx.x];
}
d = ::abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx += s_X[threadIdx.x + 32];
sumy += s_Y[threadIdx.x + 32];
}
d = ::abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx += s_X[threadIdx.x + 64];
sumy += s_Y[threadIdx.x + 64];
}
d = ::abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx += s_X[threadIdx.x + 96];
sumy += s_Y[threadIdx.x + 96];
}
plus<float> op;
device::reduce<32>(smem_tuple(s_sumx + threadIdx.y * 32, s_sumy + threadIdx.y * 32),
thrust::tie(sumx, sumy), threadIdx.x, thrust::make_tuple(op, op));
const float temp_mod = sumx * sumx + sumy * sumy;
if (temp_mod > best_mod)
{
best_mod = temp_mod;
bestx = sumx;
besty = sumy;
}
__syncthreads();
}
if (threadIdx.x == 0)
{
s_X[threadIdx.y] = bestx;
s_Y[threadIdx.y] = besty;
s_angle[threadIdx.y] = best_mod;
}
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0)
{
int bestIdx = 0;
if (s_angle[1] > s_angle[bestIdx])
bestIdx = 1;
if (s_angle[2] > s_angle[bestIdx])
bestIdx = 2;
if (s_angle[3] > s_angle[bestIdx])
bestIdx = 3;
float kp_dir = atan2f(s_Y[bestIdx], s_X[bestIdx]);
if (kp_dir < 0)
kp_dir += 2.0f * CV_PI_F;
kp_dir *= 180.0f / CV_PI_F;
kp_dir = 360.0f - kp_dir;
if (::fabsf(kp_dir - 360.f) < numeric_limits<float>::epsilon())
kp_dir = 0.f;
featureDir[blockIdx.x] = kp_dir;
}
}
#undef ORI_SEARCH_INC
#undef ORI_WIN
#undef ORI_SAMPLES
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
{
dim3 threads;
threads.x = 32;
threads.y = 4;
dim3 grid;
grid.x = nFeatures;
icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// Descriptors
#define PATCH_SZ 20
__constant__ float c_DW[PATCH_SZ * PATCH_SZ] =
{
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f,
8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
};
struct WinReader
{
typedef uchar elem_type;
__device__ __forceinline__ uchar operator ()(int i, int j) const
{
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
return tex2D(imgTex, pixel_x, pixel_y);
}
float centerX;
float centerY;
float win_offset;
float cos_dir;
float sin_dir;
int width;
int height;
};
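// The coordinate mapping WinReader applies, written out as plain host code:
// patch coordinates (i, j) are rotated by the keypoint direction about
// (centerX, centerY), so the descriptor window is sampled in the keypoint's
// own frame. Rotator and map are illustrative names.
struct Rotator
{
    float cx, cy, off, c, s;   // center, window offset, cos/sin of direction
    void map(int i, int j, float& x, float& y) const
    {
        x = cx + (off + j) * c + (off + i) * s;
        y = cy - (off + j) * s + (off + i) * c;
    }
};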
__device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
float& dx, float& dy)
{
__shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1];
dx = dy = 0.0f;
WinReader win;
win.centerX = featureX[blockIdx.x];
win.centerY = featureY[blockIdx.x];
// The sampling intervals and wavelet sizes for selecting an orientation
// and building the keypoint descriptor are defined relative to 's'
const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;
// Extract a window of pixels around the keypoint of size 20s
const int win_size = (int)((PATCH_SZ + 1) * s);
win.width = win.height = win_size;
// Nearest neighbour version (faster)
win.win_offset = -(win_size - 1.0f) / 2.0f;
float descriptor_dir = 360.0f - featureDir[blockIdx.x];
if (::fabsf(descriptor_dir - 360.f) < numeric_limits<float>::epsilon())
descriptor_dir = 0.f;
descriptor_dir *= CV_PI_F / 180.0f;
sincosf(descriptor_dir, &win.sin_dir, &win.cos_dir);
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int xLoadInd = tid % (PATCH_SZ + 1);
const int yLoadInd = tid / (PATCH_SZ + 1);
if (yLoadInd < (PATCH_SZ + 1))
{
if (s > 1)
{
AreaFilter<WinReader> filter(win, s, s);
s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd, xLoadInd);
}
else
{
LinearFilter<WinReader> filter(win);
s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd * s, xLoadInd * s);
}
}
__syncthreads();
const int xPatchInd = threadIdx.x % 5;
const int yPatchInd = threadIdx.x / 5;
if (yPatchInd < 5)
{
const int xBlockInd = threadIdx.y % 4;
const int yBlockInd = threadIdx.y / 4;
const int xInd = xBlockInd * 5 + xPatchInd;
const int yInd = yBlockInd * 5 + yPatchInd;
const float dw = c_DW[yInd * PATCH_SZ + xInd];
dx = (s_PATCH[yInd ][xInd + 1] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd + 1][xInd ]) * dw;
dy = (s_PATCH[yInd + 1][xInd ] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd ][xInd + 1]) * dw;
}
}
__global__ void compute_descriptors_64(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
__shared__ float smem[32 * 16];
float* sRow = smem + threadIdx.y * 32;
float dx, dy;
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
float dxabs = ::fabsf(dx);
float dyabs = ::fabsf(dy);
plus<float> op;
reduce<32>(sRow, dx, threadIdx.x, op);
reduce<32>(sRow, dy, threadIdx.x, op);
reduce<32>(sRow, dxabs, threadIdx.x, op);
reduce<32>(sRow, dyabs, threadIdx.x, op);
float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y;
// write dx, dy, |dx|, |dy|
if (threadIdx.x == 0)
*descriptors_block = make_float4(dx, dy, dxabs, dyabs);
}
__global__ void compute_descriptors_128(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
__shared__ float smem[32 * 16];
float* sRow = smem + threadIdx.y * 32;
float dx, dy;
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2;
plus<float> op;
float d1 = 0.0f;
float d2 = 0.0f;
float abs1 = 0.0f;
float abs2 = 0.0f;
if (dy >= 0)
{
d1 = dx;
abs1 = ::fabsf(dx);
}
else
{
d2 = dx;
abs2 = ::fabsf(dx);
}
reduce<32>(sRow, d1, threadIdx.x, op);
reduce<32>(sRow, d2, threadIdx.x, op);
reduce<32>(sRow, abs1, threadIdx.x, op);
reduce<32>(sRow, abs2, threadIdx.x, op);
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (threadIdx.x == 0)
descriptors_block[0] = make_float4(d1, abs1, d2, abs2);
if (dx >= 0)
{
d1 = dy;
abs1 = ::fabsf(dy);
d2 = 0.0f;
abs2 = 0.0f;
}
else
{
d1 = 0.0f;
abs1 = 0.0f;
d2 = dy;
abs2 = ::fabsf(dy);
}
reduce<32>(sRow, d1, threadIdx.x, op);
reduce<32>(sRow, d2, threadIdx.x, op);
reduce<32>(sRow, abs1, threadIdx.x, op);
reduce<32>(sRow, abs2, threadIdx.x, op);
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (threadIdx.x == 0)
descriptors_block[1] = make_float4(d1, abs1, d2, abs2);
}
template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
{
__shared__ float smem[BLOCK_DIM_X];
__shared__ float s_len;
// no need for thread ID
float* descriptor_base = descriptors.ptr(blockIdx.x);
// read in the unnormalized descriptor values (squared)
const float val = descriptor_base[threadIdx.x];
float len = val * val;
reduce<BLOCK_DIM_X>(smem, len, threadIdx.x, plus<float>());
if (threadIdx.x == 0)
s_len = ::sqrtf(len);
__syncthreads();
// normalize and store in output
descriptor_base[threadIdx.x] = val / s_len;
}
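// A CPU reference for normalize_descriptors, assuming the same behaviour:
// each descriptor row is scaled to unit Euclidean length. normalizeRow is an
// illustrative name.
#include <cmath>

static void normalizeRow(float* d, int n)
{
    float len = 0.0f;
    for (int i = 0; i < n; ++i)
        len += d[i] * d[i];
    const float inv = 1.0f / std::sqrt(len);
    for (int i = 0; i < n; ++i)
        d[i] *= inv;
}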
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
{
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
if (descriptors.cols == 64)
{
compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
normalize_descriptors<64><<<nFeatures, 64>>>((PtrStepSzf) descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
else
{
compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
normalize_descriptors<128><<<nFeatures, 128>>>((PtrStepSzf) descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
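// Launch shape recap: descriptors has one row per feature; both descriptor
// kernels run one block per feature with a 32x16 thread block (32 lanes per
// sub-region, 16 sub-regions), and normalization then uses one 64- or
// 128-thread block per descriptor row.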
} // namespace surf
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */

View File

@@ -263,11 +263,8 @@ namespace
namespace arithm
{
template <typename T, typename D>
void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void addMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void addMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T, typename D>
void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -345,62 +342,6 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
}
};
typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
static const vfunc_t vfuncs4[4][4] =
{
{
vadd4<unsigned int, unsigned int>,
vadd4<unsigned int, int>,
0,
0
},
{
vadd4<int, unsigned int>,
vadd4<int, int>,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
0,
0
}
};
static const vfunc_t vfuncs2[4][4] =
{
{
0,
0,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
vadd2<unsigned int, unsigned int>,
vadd2<unsigned int, int>
},
{
0,
0,
vadd2<int, unsigned int>,
vadd2<int, int>
}
};
if (dtype < 0)
dtype = src1.depth();
@@ -426,7 +367,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S)
if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -434,31 +375,27 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth];
const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (sdepth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
addMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
addMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
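// The *_v4/*_v2 fast paths use SIMD-within-a-word: four aligned 8-bit pixels
// (or two 16-bit ones) travel as one unsigned int and are processed per lane.
// A minimal host-side sketch of a packed-byte add with modular per-lane
// arithmetic (illustrative only; the real addMat_v4 kernel lives in the CUDA
// sources and can map to the __vadd4 video intrinsic on newer devices):
inline unsigned int vadd4_sketch(unsigned int a, unsigned int b)
{
    // add the low 7 bits of every byte lane; the masked-off high bits stop
    // any carry from leaking into the neighbouring lane
    const unsigned int low = (a & 0x7f7f7f7fu) + (b & 0x7f7f7f7fu);
    // restore bit 7 of each lane: carry-in XOR a7 XOR b7
    return low ^ ((a ^ b) & 0x80808080u);
}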
@@ -606,11 +543,8 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
namespace arithm
{
template <typename T, typename D>
void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void subMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void subMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T, typename D>
void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -688,62 +622,6 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
}
};
typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
static const vfunc_t vfuncs4[4][4] =
{
{
vsub4<unsigned int, unsigned int>,
vsub4<unsigned int, int>,
0,
0
},
{
vsub4<int, unsigned int>,
vsub4<int, int>,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
0,
0
}
};
static const vfunc_t vfuncs2[4][4] =
{
{
0,
0,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
vsub2<unsigned int, unsigned int>,
vsub2<unsigned int, int>
},
{
0,
0,
vsub2<int, unsigned int>,
vsub2<int, int>
}
};
if (dtype < 0)
dtype = src1.depth();
@@ -769,7 +647,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S)
if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -777,31 +655,27 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth];
const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (sdepth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
subMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
subMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
@@ -1585,11 +1459,8 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
namespace arithm
{
template <typename T>
void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T>
void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void absDiffMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void absDiffMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T>
void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -1610,20 +1481,6 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
absDiffMat<float>,
absDiffMat<double>
};
static const func_t vfuncs4[] =
{
vabsDiff4<unsigned int>,
vabsDiff4<int>,
0,
0
};
static const func_t vfuncs2[] =
{
0,
0,
vabsDiff2<unsigned int>,
vabsDiff2<int>
};
const int depth = src1.depth();
const int cn = src1.channels();
@@ -1645,7 +1502,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth < CV_32S)
if (depth == CV_8U || depth == CV_16U)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -1653,31 +1510,27 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const func_t vfunc4 = vfuncs4[depth];
const func_t vfunc2 = vfuncs2[depth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
absDiffMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
absDiffMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
@@ -1940,6 +1793,11 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)
namespace arithm
{
void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -1962,6 +1820,12 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
{cmpMatEq<double> , cmpMatNe<double> , cmpMatLt<double> , cmpMatLe<double> }
};
typedef void (*func_v4_t)(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
static const func_v4_t funcs_v4[] =
{
cmpMatEq_v4, cmpMatNe_v4, cmpMatLt_v4, cmpMatLe_v4
};
const int depth = src1.depth();
const int cn = src1.channels();
@@ -1997,6 +1861,27 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
PtrStepSzb src2_(src1.rows, src1.cols * cn, psrc2[cmpop]->data, psrc2[cmpop]->step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (isAllAligned)
{
const int vcols = src1_.cols >> 2;
funcs_v4[code](PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
}
const func_t func = funcs[depth][code];
func(src1_, src2_, dst_, stream);
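// Comparison output is 0xFF/0x00 per byte, so for CV_8U data with a
// 4-divisible width the whole compare can also run on packed 32-bit words
// (cmpMat*_v4, presumably built on __vcmp*4-style per-byte intrinsics);
// anything else falls back to the scalar per-element kernels above.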
@@ -2532,13 +2417,13 @@ void cv::gpu::lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& st
namespace arithm
{
template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void minMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void minMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void maxMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void maxMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}
@@ -2558,20 +2443,6 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
minMat<float>,
minMat<double>
};
static const func_t vfuncs4[] =
{
vmin4<unsigned int>,
vmin4<int>,
0,
0
};
static const func_t vfuncs2[] =
{
0,
0,
vmin2<unsigned int>,
vmin2<int>
};
const int depth = src1.depth();
const int cn = src1.channels();
@@ -2593,7 +2464,7 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth < CV_32S)
if (depth == CV_8U || depth == CV_16U)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -2601,31 +2472,27 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const func_t vfunc4 = vfuncs4[depth];
const func_t vfunc2 = vfuncs2[depth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
minMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
minMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
@@ -2655,20 +2522,6 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
maxMat<float>,
maxMat<double>
};
static const func_t vfuncs4[] =
{
vmax4<unsigned int>,
vmax4<int>,
0,
0
};
static const func_t vfuncs2[] =
{
0,
0,
vmax2<unsigned int>,
vmax2<int>
};
const int depth = src1.depth();
const int cn = src1.channels();
@@ -2690,7 +2543,7 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth < CV_32S)
if (depth == CV_8U || depth == CV_16U)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -2698,31 +2551,27 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const func_t vfunc4 = vfuncs4[depth];
const func_t vfunc2 = vfuncs2[depth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
maxMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
maxMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
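// min/max follow the same dispatch pattern as add/subtract/absdiff:
// minMat_v4/maxMat_v4 handle four packed CV_8U lanes per word and
// minMat_v2/maxMat_v2 two CV_16U lanes, with the scalar minMat/maxMat
// kernels covering every remaining depth.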

View File

@@ -1,418 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::gpu::SURF_GPU::SURF_GPU() { throw_nogpu(); }
cv::gpu::SURF_GPU::SURF_GPU(double, int, int, bool, float, bool) { throw_nogpu(); }
int cv::gpu::SURF_GPU::descriptorSize() const { throw_nogpu(); return 0;}
void cv::gpu::SURF_GPU::uploadKeypoints(const std::vector<KeyPoint>&, GpuMat&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat&, std::vector<KeyPoint>&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat&, std::vector<float>&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&, std::vector<float>&, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::releaseMemory() { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace device
{
namespace surf
{
void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
void bindImgTex(PtrStepSzb img);
size_t bindSumTex(PtrStepSz<unsigned int> sum);
size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
int octave, int nOctaveLayer);
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
unsigned int* featureCounter);
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
}
}}}
using namespace ::cv::gpu::device::surf;
namespace
{
int calcSize(int octave, int layer)
{
/* Wavelet size at first layer of first octave. */
const int HAAR_SIZE0 = 9;
/* Wavelet size increment between layers. This should be an even number,
such that the wavelet sizes in an octave are either all even or all odd.
This ensures that when looking for the neighbours of a sample, the layers
above and below are aligned correctly. */
const int HAAR_SIZE_INC = 6;
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
}
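// Worked example: calcSize(0, 0) = 9, calcSize(0, 2) = 9 + 6*2 = 21,
// calcSize(1, 2) = 21 << 1 = 42, calcSize(2, 0) = 9 << 2 = 36 -- each octave
// doubles the wavelet size, each layer adds HAAR_SIZE_INC before the shift.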
class SURF_GPU_Invoker
{
public:
SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) :
surf_(surf),
img_cols(img.cols), img_rows(img.rows),
use_mask(!mask.empty())
{
CV_Assert(!img.empty() && img.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);
const int min_size = calcSize(surf_.nOctaves - 1, 0);
CV_Assert(img_rows - min_size >= 0);
CV_Assert(img_cols - min_size >= 0);
const int layer_rows = img_rows >> (surf_.nOctaves - 1);
const int layer_cols = img_cols >> (surf_.nOctaves - 1);
const int min_margin = ((calcSize((surf_.nOctaves - 1), 2) >> 1) >> (surf_.nOctaves - 1)) + 1;
CV_Assert(layer_rows - 2 * min_margin > 0);
CV_Assert(layer_cols - 2 * min_margin > 0);
maxFeatures = std::min(static_cast<int>(img.size().area() * surf.keypointsRatio), 65535);
maxCandidates = std::min(static_cast<int>(1.5 * maxFeatures), 65535);
CV_Assert(maxFeatures > 0);
counters.create(1, surf_.nOctaves + 1, CV_32SC1);
counters.setTo(Scalar::all(0));
loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
bindImgTex(img);
integralBuffered(img, surf_.sum, surf_.intBuffer);
sumOffset = bindSumTex(surf_.sum);
if (use_mask)
{
min(mask, 1.0, surf_.mask1);
integralBuffered(surf_.mask1, surf_.maskSum, surf_.intBuffer);
maskOffset = bindMaskSumTex(surf_.maskSum);
}
}
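// The mask is clamped to {0, 1} before integration, so maskSum counts valid
// pixels per rectangle; sumOffset/maskOffset record the texture binding
// offsets returned by the bind*Tex helpers.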
void detectKeypoints(GpuMat& keypoints)
{
ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);
ensureSizeIsEnough(1, maxCandidates, CV_32SC4, surf_.maxPosBuffer);
ensureSizeIsEnough(SURF_GPU::ROWS_COUNT, maxFeatures, CV_32FC1, keypoints);
keypoints.setTo(Scalar::all(0));
for (int octave = 0; octave < surf_.nOctaves; ++octave)
{
const int layer_rows = img_rows >> octave;
const int layer_cols = img_cols >> octave;
loadOctaveConstants(octave, layer_rows, layer_cols);
icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);
icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers);
unsigned int maxCounter;
cudaSafeCall( cudaMemcpy(&maxCounter, counters.ptr<unsigned int>() + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates));
if (maxCounter > 0)
{
icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer.ptr<int4>(), maxCounter,
keypoints.ptr<float>(SURF_GPU::X_ROW), keypoints.ptr<float>(SURF_GPU::Y_ROW),
keypoints.ptr<int>(SURF_GPU::LAPLACIAN_ROW), keypoints.ptr<int>(SURF_GPU::OCTAVE_ROW),
keypoints.ptr<float>(SURF_GPU::SIZE_ROW), keypoints.ptr<float>(SURF_GPU::HESSIAN_ROW),
counters.ptr<unsigned int>());
}
}
unsigned int featureCounter;
cudaSafeCall( cudaMemcpy(&featureCounter, counters.ptr<unsigned int>(), sizeof(unsigned int), cudaMemcpyDeviceToHost) );
featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures));
keypoints.cols = featureCounter;
if (surf_.upright)
keypoints.row(SURF_GPU::ANGLE_ROW).setTo(Scalar::all(360.0 - 90.0));
else
findOrientation(keypoints);
}
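// counters layout: counters[0] accumulates the global feature count filled
// in by icvInterpolateKeypoint_gpu, while counters[1 + octave] holds the
// per-octave candidate count, which is why the per-octave reads above are
// offset by one.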
void findOrientation(GpuMat& keypoints)
{
const int nFeatures = keypoints.cols;
if (nFeatures > 0)
{
icvCalcOrientation_gpu(keypoints.ptr<float>(SURF_GPU::X_ROW), keypoints.ptr<float>(SURF_GPU::Y_ROW),
keypoints.ptr<float>(SURF_GPU::SIZE_ROW), keypoints.ptr<float>(SURF_GPU::ANGLE_ROW), nFeatures);
}
}
void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)
{
const int nFeatures = keypoints.cols;
if (nFeatures > 0)
{
ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors);
compute_descriptors_gpu(descriptors, keypoints.ptr<float>(SURF_GPU::X_ROW), keypoints.ptr<float>(SURF_GPU::Y_ROW),
keypoints.ptr<float>(SURF_GPU::SIZE_ROW), keypoints.ptr<float>(SURF_GPU::ANGLE_ROW), nFeatures);
}
}
private:
SURF_GPU& surf_;
int img_cols, img_rows;
bool use_mask;
int maxCandidates;
int maxFeatures;
size_t maskOffset;
size_t sumOffset;
GpuMat counters;
};
}
cv::gpu::SURF_GPU::SURF_GPU()
{
hessianThreshold = 100;
extended = true;
nOctaves = 4;
nOctaveLayers = 2;
keypointsRatio = 0.01f;
upright = false;
}
cv::gpu::SURF_GPU::SURF_GPU(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio, bool _upright)
{
hessianThreshold = _threshold;
extended = _extended;
nOctaves = _nOctaves;
nOctaveLayers = _nOctaveLayers;
keypointsRatio = _keypointsRatio;
upright = _upright;
}
int cv::gpu::SURF_GPU::descriptorSize() const
{
return extended ? 128 : 64;
}
void cv::gpu::SURF_GPU::uploadKeypoints(const std::vector<KeyPoint>& keypoints, GpuMat& keypointsGPU)
{
if (keypoints.empty())
keypointsGPU.release();
else
{
Mat keypointsCPU(SURF_GPU::ROWS_COUNT, static_cast<int>(keypoints.size()), CV_32FC1);
float* kp_x = keypointsCPU.ptr<float>(SURF_GPU::X_ROW);
float* kp_y = keypointsCPU.ptr<float>(SURF_GPU::Y_ROW);
int* kp_laplacian = keypointsCPU.ptr<int>(SURF_GPU::LAPLACIAN_ROW);
int* kp_octave = keypointsCPU.ptr<int>(SURF_GPU::OCTAVE_ROW);
float* kp_size = keypointsCPU.ptr<float>(SURF_GPU::SIZE_ROW);
float* kp_dir = keypointsCPU.ptr<float>(SURF_GPU::ANGLE_ROW);
float* kp_hessian = keypointsCPU.ptr<float>(SURF_GPU::HESSIAN_ROW);
for (size_t i = 0, size = keypoints.size(); i < size; ++i)
{
const KeyPoint& kp = keypoints[i];
kp_x[i] = kp.pt.x;
kp_y[i] = kp.pt.y;
kp_octave[i] = kp.octave;
kp_size[i] = kp.size;
kp_dir[i] = kp.angle;
kp_hessian[i] = kp.response;
kp_laplacian[i] = 1;
}
keypointsGPU.upload(keypointsCPU);
}
}
void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, std::vector<KeyPoint>& keypoints)
{
const int nFeatures = keypointsGPU.cols;
if (nFeatures == 0)
keypoints.clear();
else
{
CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == ROWS_COUNT);
Mat keypointsCPU(keypointsGPU);
keypoints.resize(nFeatures);
float* kp_x = keypointsCPU.ptr<float>(SURF_GPU::X_ROW);
float* kp_y = keypointsCPU.ptr<float>(SURF_GPU::Y_ROW);
int* kp_laplacian = keypointsCPU.ptr<int>(SURF_GPU::LAPLACIAN_ROW);
int* kp_octave = keypointsCPU.ptr<int>(SURF_GPU::OCTAVE_ROW);
float* kp_size = keypointsCPU.ptr<float>(SURF_GPU::SIZE_ROW);
float* kp_dir = keypointsCPU.ptr<float>(SURF_GPU::ANGLE_ROW);
float* kp_hessian = keypointsCPU.ptr<float>(SURF_GPU::HESSIAN_ROW);
for (int i = 0; i < nFeatures; ++i)
{
KeyPoint& kp = keypoints[i];
kp.pt.x = kp_x[i];
kp.pt.y = kp_y[i];
kp.class_id = kp_laplacian[i];
kp.octave = kp_octave[i];
kp.size = kp_size[i];
kp.angle = kp_dir[i];
kp.response = kp_hessian[i];
}
}
}
void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat& descriptorsGPU, std::vector<float>& descriptors)
{
if (descriptorsGPU.empty())
descriptors.clear();
else
{
CV_Assert(descriptorsGPU.type() == CV_32F);
descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols);
Mat descriptorsCPU(descriptorsGPU.size(), CV_32F, &descriptors[0]);
descriptorsGPU.download(descriptorsCPU);
}
}
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints)
{
if (!img.empty())
{
SURF_GPU_Invoker surf(*this, img, mask);
surf.detectKeypoints(keypoints);
}
}
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
bool useProvidedKeypoints)
{
if (!img.empty())
{
SURF_GPU_Invoker surf(*this, img, mask);
if (!useProvidedKeypoints)
surf.detectKeypoints(keypoints);
else if (!upright)
{
surf.findOrientation(keypoints);
}
surf.computeDescriptors(keypoints, descriptors, descriptorSize());
}
}
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
{
GpuMat keypointsGPU;
(*this)(img, mask, keypointsGPU);
downloadKeypoints(keypointsGPU, keypoints);
}
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints,
GpuMat& descriptors, bool useProvidedKeypoints)
{
GpuMat keypointsGPU;
if (useProvidedKeypoints)
uploadKeypoints(keypoints, keypointsGPU);
(*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints);
downloadKeypoints(keypointsGPU, keypoints);
}
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints,
std::vector<float>& descriptors, bool useProvidedKeypoints)
{
GpuMat descriptorsGPU;
(*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints);
downloadDescriptors(descriptorsGPU, descriptors);
}
void cv::gpu::SURF_GPU::releaseMemory()
{
sum.release();
mask1.release();
maskSum.release();
intBuffer.release();
det.release();
trace.release();
maxPosBuffer.release();
}
#endif /* !defined (HAVE_CUDA) */