renamed gpuwarping -> cudawarping

2013-07-23 15:57:59 +04:00
parent dcd600cc47
commit d4901a99ea
52 changed files with 45 additions and 45 deletions
--- a/modules/cudawarping/src/cuda/build_warp_maps.cu
+++ b/modules/cudawarping/src/cuda/build_warp_maps.cu
@@ -0,0 +1,221 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/core/cuda/border_interpolate.hpp"
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace imgproc
+    {
+        // TODO use intrinsics like __sinf and so on
+
+        namespace build_warp_maps
+        {
+
+            __constant__ float ck_rinv[9];
+            __constant__ float cr_kinv[9];
+            __constant__ float ct[3];
+            __constant__ float cscale;
+        }
+
+
+        class PlaneMapper
+        {
+        public:
+            static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
+            {
+                using namespace build_warp_maps;
+
+                float x_ = u / cscale - ct[0];
+                float y_ = v / cscale - ct[1];
+
+                float z;
+                x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);
+                y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);
+                z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);
+
+                x /= z;
+                y /= z;
+            }
+        };
+
+
+        class CylindricalMapper
+        {
+        public:
+            static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
+            {
+                using namespace build_warp_maps;
+
+                u /= cscale;
+                float x_ = ::sinf(u);
+                float y_ = v / cscale;
+                float z_ = ::cosf(u);
+
+                float z;
+                x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
+                y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
+                z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
+
+                if (z > 0) { x /= z; y /= z; }
+                else x = y = -1;
+            }
+        };
+
+
+        class SphericalMapper
+        {
+        public:
+            static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
+            {
+                using namespace build_warp_maps;
+
+                v /= cscale;
+                u /= cscale;
+
+                float sinv = ::sinf(v);
+                float x_ = sinv * ::sinf(u);
+                float y_ = -::cosf(v);
+                float z_ = sinv * ::cosf(u);
+
+                float z;
+                x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
+                y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
+                z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
+
+                if (z > 0) { x /= z; y /= z; }
+                else x = y = -1;
+            }
+        };
+
+
+        template <typename Mapper>
+        __global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows,
+                                            PtrStepf map_x, PtrStepf map_y)
+        {
+            int du = blockIdx.x * blockDim.x + threadIdx.x;
+            int dv = blockIdx.y * blockDim.y + threadIdx.y;
+            if (du < cols && dv < rows)
+            {
+                float u = tl_u + du;
+                float v = tl_v + dv;
+                float x, y;
+                Mapper::mapBackward(u, v, x, y);
+                map_x.ptr(dv)[du] = x;
+                map_y.ptr(dv)[du] = y;
+            }
+        }
+
+
+        void buildWarpPlaneMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
+                                const float k_rinv[9], const float r_kinv[9], const float t[3],
+                                float scale, cudaStream_t stream)
+        {
+            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
+            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
+            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ct, t, 3*sizeof(float)));
+            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
+
+            int cols = map_x.cols;
+            int rows = map_x.rows;
+
+            dim3 threads(32, 8);
+            dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
+
+            buildWarpMapsKernel<PlaneMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
+            cudaSafeCall(cudaGetLastError());
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+
+
+        void buildWarpCylindricalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
+                                      const float k_rinv[9], const float r_kinv[9], float scale,
+                                      cudaStream_t stream)
+        {
+            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
+            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
+            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
+
+            int cols = map_x.cols;
+            int rows = map_x.rows;
+
+            dim3 threads(32, 8);
+            dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
+
+            buildWarpMapsKernel<CylindricalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
+            cudaSafeCall(cudaGetLastError());
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+
+
+        void buildWarpSphericalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
+                                    const float k_rinv[9], const float r_kinv[9], float scale,
+                                    cudaStream_t stream)
+        {
+            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
+            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
+            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
+
+            int cols = map_x.cols;
+            int rows = map_x.rows;
+
+            dim3 threads(32, 8);
+            dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
+
+            buildWarpMapsKernel<SphericalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
+            cudaSafeCall(cudaGetLastError());
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+    } // namespace imgproc
+}}} // namespace cv { namespace cuda { namespace cudev {
+
+
+#endif /* CUDA_DISABLER */
--- a/modules/cudawarping/src/cuda/pyr_down.cu
+++ b/modules/cudawarping/src/cuda/pyr_down.cu
@@ -0,0 +1,228 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/border_interpolate.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace imgproc
+    {
+        template <typename T, typename B> __global__ void pyrDown(const PtrStepSz<T> src, PtrStep<T> dst, const B b, int dst_cols)
+        {
+            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_t;
+
+            __shared__ work_t smem[256 + 4];
+
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y;
+
+            const int src_y = 2 * y;
+
+            if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2)
+            {
+                {
+                    work_t sum;
+
+                    sum =       0.0625f * src(src_y - 2, x);
+                    sum = sum + 0.25f   * src(src_y - 1, x);
+                    sum = sum + 0.375f  * src(src_y    , x);
+                    sum = sum + 0.25f   * src(src_y + 1, x);
+                    sum = sum + 0.0625f * src(src_y + 2, x);
+
+                    smem[2 + threadIdx.x] = sum;
+                }
+
+                if (threadIdx.x < 2)
+                {
+                    const int left_x = x - 2;
+
+                    work_t sum;
+
+                    sum =       0.0625f * src(src_y - 2, left_x);
+                    sum = sum + 0.25f   * src(src_y - 1, left_x);
+                    sum = sum + 0.375f  * src(src_y    , left_x);
+                    sum = sum + 0.25f   * src(src_y + 1, left_x);
+                    sum = sum + 0.0625f * src(src_y + 2, left_x);
+
+                    smem[threadIdx.x] = sum;
+                }
+
+                if (threadIdx.x > 253)
+                {
+                    const int right_x = x + 2;
+
+                    work_t sum;
+
+                    sum =       0.0625f * src(src_y - 2, right_x);
+                    sum = sum + 0.25f   * src(src_y - 1, right_x);
+                    sum = sum + 0.375f  * src(src_y    , right_x);
+                    sum = sum + 0.25f   * src(src_y + 1, right_x);
+                    sum = sum + 0.0625f * src(src_y + 2, right_x);
+
+                    smem[4 + threadIdx.x] = sum;
+                }
+            }
+            else
+            {
+                {
+                    work_t sum;
+
+                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x));
+                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col_high(x));
+                    sum = sum + 0.375f  * src(src_y                    , b.idx_col_high(x));
+                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col_high(x));
+                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x));
+
+                    smem[2 + threadIdx.x] = sum;
+                }
+
+                if (threadIdx.x < 2)
+                {
+                    const int left_x = x - 2;
+
+                    work_t sum;
+
+                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x));
+                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col(left_x));
+                    sum = sum + 0.375f  * src(src_y                    , b.idx_col(left_x));
+                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col(left_x));
+                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x));
+
+                    smem[threadIdx.x] = sum;
+                }
+
+                if (threadIdx.x > 253)
+                {
+                    const int right_x = x + 2;
+
+                    work_t sum;
+
+                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x));
+                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x));
+                    sum = sum + 0.375f  * src(src_y                    , b.idx_col_high(right_x));
+                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x));
+                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x));
+
+                    smem[4 + threadIdx.x] = sum;
+                }
+            }
+
+            __syncthreads();
+
+            if (threadIdx.x < 128)
+            {
+                const int tid2 = threadIdx.x * 2;
+
+                work_t sum;
+
+                sum =       0.0625f * smem[2 + tid2 - 2];
+                sum = sum + 0.25f   * smem[2 + tid2 - 1];
+                sum = sum + 0.375f  * smem[2 + tid2    ];
+                sum = sum + 0.25f   * smem[2 + tid2 + 1];
+                sum = sum + 0.0625f * smem[2 + tid2 + 2];
+
+                const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
+
+                if (dst_x < dst_cols)
+                    dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
+            }
+        }
+
+        template <typename T, template <typename> class B> void pyrDown_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
+        {
+            const dim3 block(256);
+            const dim3 grid(divUp(src.cols, block.x), dst.rows);
+
+            B<T> b(src.rows, src.cols);
+
+            pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
+        {
+            pyrDown_caller<T, BrdReflect101>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
+        }
+
+        template void pyrDown_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        //template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        template void pyrDown_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        template void pyrDown_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        //template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrDown_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    } // namespace imgproc
+}}} // namespace cv { namespace cuda { namespace cudev
+
+
+#endif /* CUDA_DISABLER */
--- a/modules/cudawarping/src/cuda/pyr_up.cu
+++ b/modules/cudawarping/src/cuda/pyr_up.cu
@@ -0,0 +1,196 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/border_interpolate.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace imgproc
+    {
+        template <typename T> __global__ void pyrUp(const PtrStepSz<T> src, PtrStepSz<T> dst)
+        {
+            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
+
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            __shared__ sum_t s_srcPatch[10][10];
+            __shared__ sum_t s_dstPatch[20][16];
+
+            if (threadIdx.x < 10 && threadIdx.y < 10)
+            {
+                int srcx = static_cast<int>((blockIdx.x * blockDim.x) / 2 + threadIdx.x) - 1;
+                int srcy = static_cast<int>((blockIdx.y * blockDim.y) / 2 + threadIdx.y) - 1;
+
+                srcx = ::abs(srcx);
+                srcx = ::min(src.cols - 1, srcx);
+
+                srcy = ::abs(srcy);
+                srcy = ::min(src.rows - 1, srcy);
+
+                s_srcPatch[threadIdx.y][threadIdx.x] = saturate_cast<sum_t>(src(srcy, srcx));
+            }
+
+            __syncthreads();
+
+            sum_t sum = VecTraits<sum_t>::all(0);
+
+            const int evenFlag = static_cast<int>((threadIdx.x & 1) == 0);
+            const int oddFlag  = static_cast<int>((threadIdx.x & 1) != 0);
+            const bool eveny = ((threadIdx.y & 1) == 0);
+            const int tidx = threadIdx.x;
+
+            if (eveny)
+            {
+                sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 2) >> 1)];
+                sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 1) >> 1)];
+                sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx    ) >> 1)];
+                sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 1) >> 1)];
+                sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 2) >> 1)];
+            }
+
+            s_dstPatch[2 + threadIdx.y][threadIdx.x] = sum;
+
+            if (threadIdx.y < 2)
+            {
+                sum = VecTraits<sum_t>::all(0);
+
+                if (eveny)
+                {
+                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
+                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
+                    sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
+                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
+                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
+                }
+
+                s_dstPatch[threadIdx.y][threadIdx.x] = sum;
+            }
+
+            if (threadIdx.y > 13)
+            {
+                sum = VecTraits<sum_t>::all(0);
+
+                if (eveny)
+                {
+                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
+                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
+                    sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
+                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
+                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
+                }
+
+                s_dstPatch[4 + threadIdx.y][threadIdx.x] = sum;
+            }
+
+            __syncthreads();
+
+            sum = VecTraits<sum_t>::all(0);
+
+            const int tidy = threadIdx.y;
+
+            sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][threadIdx.x];
+            sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][threadIdx.x];
+            sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][threadIdx.x];
+            sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][threadIdx.x];
+            sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][threadIdx.x];
+
+            if (x < dst.cols && y < dst.rows)
+                dst(y, x) = saturate_cast<T>(4.0f * sum);
+        }
+
+        template <typename T> void pyrUp_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
+        {
+            const dim3 block(16, 16);
+            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+            pyrUp<<<grid, block, 0, stream>>>(src, dst);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
+        {
+            pyrUp_caller<T>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
+        }
+
+        template void pyrUp_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        //template void pyrUp_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        template void pyrUp_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        template void pyrUp_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        //template void pyrUp_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+        template void pyrUp_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        //template void pyrUp_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+        template void pyrUp_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    } // namespace imgproc
+}}} // namespace cv { namespace cuda { namespace cudev
+
+#endif /* CUDA_DISABLER */
--- a/modules/cudawarping/src/cuda/remap.cu
+++ b/modules/cudawarping/src/cuda/remap.cu
@@ -0,0 +1,274 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/border_interpolate.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/core/cuda/filters.hpp"
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace imgproc
+    {
+        template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, PtrStepSz<T> dst)
+        {
+            const int x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+            if (x < dst.cols && y < dst.rows)
+            {
+                const float xcoo = mapx.ptr(y)[x];
+                const float ycoo = mapy.ptr(y)[x];
+
+                dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
+            }
+        }
+
+        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
+        {
+            static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
+            {
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
+
+                dim3 block(32, 8);
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
+                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
+                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
+
+                remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
+                cudaSafeCall( cudaGetLastError() );
+            }
+        };
+
+        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, bool)
+            {
+                (void)srcWhole;
+                (void)xoff;
+                (void)yoff;
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
+
+                dim3 block(32, 8);
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
+                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
+                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
+
+                remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
+                cudaSafeCall( cudaGetLastError() );
+
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
+
+        #define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
+            texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+            struct tex_remap_ ## type ## _reader \
+            { \
+                typedef type elem_type; \
+                typedef int index_type; \
+                int xoff, yoff; \
+                tex_remap_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+                { \
+                    return tex2D(tex_remap_ ## type , x + xoff, y + yoff); \
+                } \
+            }; \
+            template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
+            { \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
+                    PtrStepSz< type > dst, const float* borderValue, bool cc20) \
+                { \
+                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
+                    dim3 block(32, cc20 ? 8 : 4); \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_remap_ ## type , srcWhole); \
+                    tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
+                    B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
+                    BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
+                    Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
+                    remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+                } \
+            }; \
+            template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
+            { \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
+                    PtrStepSz< type > dst, const float*, bool) \
+                { \
+                    dim3 block(32, 8); \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_remap_ ## type , srcWhole); \
+                    tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
+                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
+                    { \
+                        Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
+                        remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
+                    } \
+                    else \
+                    { \
+                        BrdReplicate<type> brd(src.rows, src.cols); \
+                        BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
+                        Filter< BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
+                        remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
+                    } \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+                } \
+            };
+
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
+
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
+
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
+
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
+
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
+
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
+
+        #undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
+
+        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
+                PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
+            {
+                if (stream == 0)
+                    RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc20);
+                else
+                    RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc20);
+            }
+        };
+
+        template <typename T> void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
+            PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
+        {
+            typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
+                PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
+
+            static const caller_t callers[3][5] =
+            {
+                {
+                    RemapDispatcher<PointFilter, BrdConstant, T>::call,
+                    RemapDispatcher<PointFilter, BrdReplicate, T>::call,
+                    RemapDispatcher<PointFilter, BrdReflect, T>::call,
+                    RemapDispatcher<PointFilter, BrdWrap, T>::call,
+                    RemapDispatcher<PointFilter, BrdReflect101, T>::call
+                },
+                {
+                    RemapDispatcher<LinearFilter, BrdConstant, T>::call,
+                    RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
+                    RemapDispatcher<LinearFilter, BrdReflect, T>::call,
+                    RemapDispatcher<LinearFilter, BrdWrap, T>::call,
+                    RemapDispatcher<LinearFilter, BrdReflect101, T>::call
+                },
+                {
+                    RemapDispatcher<CubicFilter, BrdConstant, T>::call,
+                    RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
+                    RemapDispatcher<CubicFilter, BrdReflect, T>::call,
+                    RemapDispatcher<CubicFilter, BrdWrap, T>::call,
+                    RemapDispatcher<CubicFilter, BrdReflect101, T>::call
+                }
+            };
+
+            callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
+                static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
+        }
+
+        template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        //template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        //template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    } // namespace imgproc
+}}} // namespace cv { namespace cuda { namespace cudev
+
+
+#endif /* CUDA_DISABLER */
--- a/modules/cudawarping/src/cuda/resize.cu
+++ b/modules/cudawarping/src/cuda/resize.cu
@@ -0,0 +1,482 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include <cfloat>
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/border_interpolate.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/core/cuda/filters.hpp"
+
+namespace cv { namespace cuda { namespace device
+{
+    // kernels
+
+    template <typename T> __global__ void resize_nearest(const PtrStep<T> src, PtrStepSz<T> dst, const float fy, const float fx)
+    {
+        const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (dst_x < dst.cols && dst_y < dst.rows)
+        {
+            const float src_x = dst_x * fx;
+            const float src_y = dst_y * fy;
+
+            dst(dst_y, dst_x) = src(__float2int_rz(src_y), __float2int_rz(src_x));
+        }
+    }
+
+    template <typename T> __global__ void resize_linear(const PtrStepSz<T> src, PtrStepSz<T> dst, const float fy, const float fx)
+    {
+        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
+
+        const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (dst_x < dst.cols && dst_y < dst.rows)
+        {
+            const float src_x = dst_x * fx;
+            const float src_y = dst_y * fy;
+
+            work_type out = VecTraits<work_type>::all(0);
+
+            const int x1 = __float2int_rd(src_x);
+            const int y1 = __float2int_rd(src_y);
+            const int x2 = x1 + 1;
+            const int y2 = y1 + 1;
+            const int x2_read = ::min(x2, src.cols - 1);
+            const int y2_read = ::min(y2, src.rows - 1);
+
+            T src_reg = src(y1, x1);
+            out = out + src_reg * ((x2 - src_x) * (y2 - src_y));
+
+            src_reg = src(y1, x2_read);
+            out = out + src_reg * ((src_x - x1) * (y2 - src_y));
+
+            src_reg = src(y2_read, x1);
+            out = out + src_reg * ((x2 - src_x) * (src_y - y1));
+
+            src_reg = src(y2_read, x2_read);
+            out = out + src_reg * ((src_x - x1) * (src_y - y1));
+
+            dst(dst_y, dst_x) = saturate_cast<T>(out);
+        }
+    }
+
+    template <class Ptr2D, typename T> __global__ void resize(const Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx)
+    {
+        const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (dst_x < dst.cols && dst_y < dst.rows)
+        {
+            const float src_x = dst_x * fx;
+            const float src_y = dst_y * fy;
+
+            dst(dst_y, dst_x) = src(src_y, src_x);
+        }
+    }
+
+    template <typename Ptr2D, typename T> __global__ void resize_area(const Ptr2D src, PtrStepSz<T> dst)
+    {
+        const int x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (x < dst.cols && y < dst.rows)
+        {
+            dst(y, x) = src(y, x);
+        }
+    }
+
+    // textures
+
+    template <typename T> struct TextureAccessor;
+
+    #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
+        texture<type, cudaTextureType2D, cudaReadModeElementType> tex_resize_##type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+        template <> struct TextureAccessor<type> \
+        { \
+            typedef type elem_type; \
+            typedef int index_type; \
+            int xoff; \
+            int yoff; \
+            __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+            { \
+                return tex2D(tex_resize_##type, x + xoff, y + yoff); \
+            } \
+            __host__ static void bind(const PtrStepSz<type>& mat) \
+            { \
+                bindTexture(&tex_resize_##type, mat); \
+            } \
+        };
+
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
+
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
+
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
+
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
+
+    #undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
+
+    template <typename T>
+    TextureAccessor<T> texAccessor(const PtrStepSz<T>& mat, int yoff, int xoff)
+    {
+        TextureAccessor<T>::bind(mat);
+
+        TextureAccessor<T> t;
+        t.xoff = xoff;
+        t.yoff = yoff;
+
+        return t;
+    }
+
+    // callers for nearest interpolation
+
+    template <typename T>
+    void call_resize_nearest_glob(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+        resize_nearest<<<grid, block, 0, stream>>>(src, dst, fy, fx);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template <typename T>
+    void call_resize_nearest_tex(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+        resize<<<grid, block>>>(texAccessor(srcWhole, yoff, xoff), dst, fy, fx);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    // callers for linear interpolation
+
+    template <typename T>
+    void call_resize_linear_glob(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+        resize_linear<<<grid, block>>>(src, dst, fy, fx);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template <typename T>
+    void call_resize_linear_tex(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+        if (srcWhole.data == src.data)
+        {
+            TextureAccessor<T> texSrc = texAccessor(src, 0, 0);
+            LinearFilter< TextureAccessor<T> > filteredSrc(texSrc);
+
+            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
+        }
+        else
+        {
+            TextureAccessor<T> texSrc = texAccessor(srcWhole, yoff, xoff);
+
+            BrdReplicate<T> brd(src.rows, src.cols);
+            BorderReader<TextureAccessor<T>, BrdReplicate<T> > brdSrc(texSrc, brd);
+            LinearFilter< BorderReader<TextureAccessor<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
+
+            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
+        }
+
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    // callers for cubic interpolation
+
+    template <typename T>
+    void call_resize_cubic_glob(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+        BrdReplicate<T> brd(src.rows, src.cols);
+        BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
+        CubicFilter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
+
+        resize<<<grid, block, 0, stream>>>(filteredSrc, dst, fy, fx);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template <typename T>
+    void call_resize_cubic_tex(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+        if (srcWhole.data == src.data)
+        {
+            TextureAccessor<T> texSrc = texAccessor(src, 0, 0);
+            CubicFilter< TextureAccessor<T> > filteredSrc(texSrc);
+
+            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
+        }
+        else
+        {
+            TextureAccessor<T> texSrc = texAccessor(srcWhole, yoff, xoff);
+
+            BrdReplicate<T> brd(src.rows, src.cols);
+            BorderReader<TextureAccessor<T>, BrdReplicate<T> > brdSrc(texSrc, brd);
+            CubicFilter< BorderReader<TextureAccessor<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
+
+            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
+        }
+
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    // ResizeNearestDispatcher
+
+    template <typename T> struct ResizeNearestDispatcher
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            call_resize_nearest_glob(src, dst, fy, fx, stream);
+        }
+    };
+
+    template <typename T> struct SelectImplForNearest
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            if (stream)
+                call_resize_nearest_glob(src, dst, fy, fx, stream);
+            else
+            {
+                if (fx > 1 || fy > 1)
+                    call_resize_nearest_glob(src, dst, fy, fx, 0);
+                else
+                    call_resize_nearest_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
+            }
+        }
+    };
+
+    template <> struct ResizeNearestDispatcher<uchar> : SelectImplForNearest<uchar> {};
+    template <> struct ResizeNearestDispatcher<uchar4> : SelectImplForNearest<uchar4> {};
+
+    template <> struct ResizeNearestDispatcher<ushort> : SelectImplForNearest<ushort> {};
+    template <> struct ResizeNearestDispatcher<ushort4> : SelectImplForNearest<ushort4> {};
+
+    template <> struct ResizeNearestDispatcher<short> : SelectImplForNearest<short> {};
+    template <> struct ResizeNearestDispatcher<short4> : SelectImplForNearest<short4> {};
+
+    template <> struct ResizeNearestDispatcher<float> : SelectImplForNearest<float> {};
+    template <> struct ResizeNearestDispatcher<float4> : SelectImplForNearest<float4> {};
+
+    // ResizeLinearDispatcher
+
+    template <typename T> struct ResizeLinearDispatcher
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            call_resize_linear_glob(src, dst, fy, fx, stream);
+        }
+    };
+
+    template <typename T> struct SelectImplForLinear
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            if (stream)
+                call_resize_linear_glob(src, dst, fy, fx, stream);
+            else
+            {
+                if (fx > 1 || fy > 1)
+                    call_resize_linear_glob(src, dst, fy, fx, 0);
+                else
+                    call_resize_linear_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
+            }
+        }
+    };
+
+    template <> struct ResizeLinearDispatcher<uchar> : SelectImplForLinear<uchar> {};
+    template <> struct ResizeLinearDispatcher<uchar4> : SelectImplForLinear<uchar4> {};
+
+    template <> struct ResizeLinearDispatcher<ushort> : SelectImplForLinear<ushort> {};
+    template <> struct ResizeLinearDispatcher<ushort4> : SelectImplForLinear<ushort4> {};
+
+    template <> struct ResizeLinearDispatcher<short> : SelectImplForLinear<short> {};
+    template <> struct ResizeLinearDispatcher<short4> : SelectImplForLinear<short4> {};
+
+    template <> struct ResizeLinearDispatcher<float> : SelectImplForLinear<float> {};
+    template <> struct ResizeLinearDispatcher<float4> : SelectImplForLinear<float4> {};
+
+    // ResizeCubicDispatcher
+
+    template <typename T> struct ResizeCubicDispatcher
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            call_resize_cubic_glob(src, dst, fy, fx, stream);
+        }
+    };
+
+    template <typename T> struct SelectImplForCubic
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            if (stream)
+                call_resize_cubic_glob(src, dst, fy, fx, stream);
+            else
+                call_resize_cubic_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
+        }
+    };
+
+    template <> struct ResizeCubicDispatcher<uchar> : SelectImplForCubic<uchar> {};
+    template <> struct ResizeCubicDispatcher<uchar4> : SelectImplForCubic<uchar4> {};
+
+    template <> struct ResizeCubicDispatcher<ushort> : SelectImplForCubic<ushort> {};
+    template <> struct ResizeCubicDispatcher<ushort4> : SelectImplForCubic<ushort4> {};
+
+    template <> struct ResizeCubicDispatcher<short> : SelectImplForCubic<short> {};
+    template <> struct ResizeCubicDispatcher<short4> : SelectImplForCubic<short4> {};
+
+    template <> struct ResizeCubicDispatcher<float> : SelectImplForCubic<float> {};
+    template <> struct ResizeCubicDispatcher<float4> : SelectImplForCubic<float4> {};
+
+    // ResizeAreaDispatcher
+
+    template <typename T> struct ResizeAreaDispatcher
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>&, int, int, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            const int iscale_x = (int) round(fx);
+            const int iscale_y = (int) round(fy);
+
+            const dim3 block(32, 8);
+            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+            if (std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN)
+            {
+                BrdConstant<T> brd(src.rows, src.cols);
+                BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
+                IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
+
+                resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst);
+            }
+            else
+            {
+                BrdConstant<T> brd(src.rows, src.cols);
+                BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
+                AreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
+
+                resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst);
+            }
+
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+
+    // resize
+
+    template <typename T> void resize(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream)
+    {
+        typedef void (*func_t)(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream);
+        static const func_t funcs[4] =
+        {
+            ResizeNearestDispatcher<T>::call,
+            ResizeLinearDispatcher<T>::call,
+            ResizeCubicDispatcher<T>::call,
+            ResizeAreaDispatcher<T>::call
+        };
+
+        // change to linear if area interpolation upscaling
+        if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
+            interpolation = 1;
+
+        funcs[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), yoff, xoff, static_cast< PtrStepSz<T> >(dst), fy, fx, stream);
+    }
+
+    template void resize<uchar >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<uchar3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<uchar4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+
+    template void resize<ushort >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<ushort3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<ushort4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+
+    template void resize<short >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<short3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<short4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+
+    template void resize<float >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<float3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<float4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+}}}
+
+#endif /* CUDA_DISABLER */
--- a/modules/cudawarping/src/cuda/warp.cu
+++ b/modules/cudawarping/src/cuda/warp.cu
@@ -0,0 +1,389 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/border_interpolate.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/core/cuda/filters.hpp"
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace imgproc
+    {
+        __constant__ float c_warpMat[3 * 3];
+
+        struct AffineTransform
+        {
+            static __device__ __forceinline__ float2 calcCoord(int x, int y)
+            {
+                const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
+                const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];
+
+                return make_float2(xcoo, ycoo);
+            }
+        };
+
+        struct PerspectiveTransform
+        {
+            static __device__ __forceinline__ float2 calcCoord(int x, int y)
+            {
+                const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
+
+                const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
+                const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);
+
+                return make_float2(xcoo, ycoo);
+            }
+        };
+
+        ///////////////////////////////////////////////////////////////////
+        // Build Maps
+
+        template <class Transform> __global__ void buildWarpMaps(PtrStepSzf xmap, PtrStepf ymap)
+        {
+            const int x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+            if (x < xmap.cols && y < xmap.rows)
+            {
+                const float2 coord = Transform::calcCoord(x, y);
+
+                xmap(y, x) = coord.x;
+                ymap(y, x) = coord.y;
+            }
+        }
+
+        template <class Transform> void buildWarpMaps_caller(PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
+        {
+            dim3 block(32, 8);
+            dim3 grid(divUp(xmap.cols, block.x), divUp(xmap.rows, block.y));
+
+            buildWarpMaps<Transform><<<grid, block, 0, stream>>>(xmap, ymap);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
+        {
+            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
+
+            buildWarpMaps_caller<AffineTransform>(xmap, ymap, stream);
+        }
+
+        void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
+        {
+            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
+
+            buildWarpMaps_caller<PerspectiveTransform>(xmap, ymap, stream);
+        }
+
+        ///////////////////////////////////////////////////////////////////
+        // Warp
+
+        template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, PtrStepSz<T> dst)
+        {
+            const int x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+            if (x < dst.cols && y < dst.rows)
+            {
+                const float2 coord = Transform::calcCoord(x, y);
+
+                dst.ptr(y)[x] = saturate_cast<T>(src(coord.y, coord.x));
+            }
+        }
+
+        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
+            {
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
+
+                dim3 block(32, 8);
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
+                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
+                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
+
+                warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst);
+                cudaSafeCall( cudaGetLastError() );
+            }
+        };
+
+        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, bool)
+            {
+                (void)xoff;
+                (void)yoff;
+                (void)srcWhole;
+
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
+
+                dim3 block(32, 8);
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
+                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
+                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
+
+                warp<Transform><<<grid, block>>>(filter_src, dst);
+                cudaSafeCall( cudaGetLastError() );
+
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
+
+        #define OPENCV_GPU_IMPLEMENT_WARP_TEX(type) \
+            texture< type , cudaTextureType2D > tex_warp_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+            struct tex_warp_ ## type ## _reader \
+            { \
+                typedef type elem_type; \
+                typedef int index_type; \
+                int xoff, yoff; \
+                tex_warp_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+                { \
+                    return tex2D(tex_warp_ ## type , x + xoff, y + yoff); \
+                } \
+            }; \
+            template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
+            { \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, bool cc20) \
+                { \
+                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
+                    dim3 block(32, cc20 ? 8 : 4); \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_warp_ ## type , srcWhole); \
+                    tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
+                    B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
+                    BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
+                    Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
+                    warp<Transform><<<grid, block>>>(filter_src, dst); \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+                } \
+            }; \
+            template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
+            { \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, bool) \
+                { \
+                    dim3 block(32, 8); \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_warp_ ## type , srcWhole); \
+                    tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
+                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
+                    { \
+                        Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
+                        warp<Transform><<<grid, block>>>(filter_src, dst); \
+                    } \
+                    else \
+                    { \
+                        BrdReplicate<type> brd(src.rows, src.cols); \
+                        BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
+                        Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
+                        warp<Transform><<<grid, block>>>(filter_src, dst); \
+                    } \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+                } \
+            };
+
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar2)
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar4)
+
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(schar)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(char2)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(char4)
+
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort2)
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort4)
+
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(short)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(short2)
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(short4)
+
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(int)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(int2)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(int4)
+
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(float)
+        //OPENCV_GPU_IMPLEMENT_WARP_TEX(float2)
+        OPENCV_GPU_IMPLEMENT_WARP_TEX(float4)
+
+        #undef OPENCV_GPU_IMPLEMENT_WARP_TEX
+
+        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
+            {
+                if (stream == 0)
+                    WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc20);
+                else
+                    WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc20);
+            }
+        };
+
+        template <class Transform, typename T>
+        void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation,
+                         int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
+        {
+            typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
+
+            static const func_t funcs[3][5] =
+            {
+                {
+                    WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdReflect101, T>::call
+                },
+                {
+                    WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call
+                },
+                {
+                    WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call,
+                    WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call,
+                    WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call,
+                    WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call,
+                    WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call
+                }
+            };
+
+            funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
+                static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
+        }
+
+        template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
+                                                  int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
+        {
+            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
+
+            warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
+        }
+
+        template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        //template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        //template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
+                                                  int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
+        {
+            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
+
+            warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
+        }
+
+        template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        //template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        //template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        //template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+        template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    } // namespace imgproc
+}}} // namespace cv { namespace cuda { namespace cudev
+
+
+#endif /* CUDA_DISABLER */
--- a/modules/cudawarping/src/precomp.cpp
+++ b/modules/cudawarping/src/precomp.cpp
@@ -0,0 +1,43 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
--- a/modules/cudawarping/src/precomp.hpp
+++ b/modules/cudawarping/src/precomp.hpp
@@ -0,0 +1,57 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_PRECOMP_H__
+#define __OPENCV_PRECOMP_H__
+
+#include "opencv2/cudawarping.hpp"
+
+#include "opencv2/core/private.cuda.hpp"
+
+#include "opencv2/opencv_modules.hpp"
+
+#ifdef HAVE_OPENCV_CUDALEGACY
+#  include "opencv2/cudalegacy.hpp"
+#  include "opencv2/cudalegacy/private.hpp"
+#endif
+
+#endif /* __OPENCV_PRECOMP_H__ */
--- a/modules/cudawarping/src/pyramids.cpp
+++ b/modules/cudawarping/src/pyramids.cpp
@@ -0,0 +1,244 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::cuda;
+
+#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+
+void cv::cuda::pyrDown(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::cuda::pyrUp(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
+
+Ptr<ImagePyramid> cv::cuda::createImagePyramid(InputArray, int, Stream&) { throw_no_cuda(); return Ptr<ImagePyramid>(); }
+
+#else // HAVE_CUDA
+
+//////////////////////////////////////////////////////////////////////////////
+// pyrDown
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace imgproc
+    {
+        template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    }
+}}}
+
+void cv::cuda::pyrDown(InputArray _src, OutputArray _dst, Stream& stream)
+{
+    using namespace cv::cuda::device::imgproc;
+
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    static const func_t funcs[6][4] =
+    {
+        {pyrDown_gpu<uchar>      , 0 /*pyrDown_gpu<uchar2>*/ , pyrDown_gpu<uchar3>      , pyrDown_gpu<uchar4>      },
+        {0 /*pyrDown_gpu<schar>*/, 0 /*pyrDown_gpu<schar2>*/ , 0 /*pyrDown_gpu<schar3>*/, 0 /*pyrDown_gpu<schar4>*/},
+        {pyrDown_gpu<ushort>     , 0 /*pyrDown_gpu<ushort2>*/, pyrDown_gpu<ushort3>     , pyrDown_gpu<ushort4>     },
+        {pyrDown_gpu<short>      , 0 /*pyrDown_gpu<short2>*/ , pyrDown_gpu<short3>      , pyrDown_gpu<short4>      },
+        {0 /*pyrDown_gpu<int>*/  , 0 /*pyrDown_gpu<int2>*/   , 0 /*pyrDown_gpu<int3>*/  , 0 /*pyrDown_gpu<int4>*/  },
+        {pyrDown_gpu<float>      , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3>      , pyrDown_gpu<float4>      }
+    };
+
+    GpuMat src = _src.getGpuMat();
+
+    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
+
+    const func_t func = funcs[src.depth()][src.channels() - 1];
+    CV_Assert( func != 0 );
+
+    _dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
+    GpuMat dst = _dst.getGpuMat();
+
+    func(src, dst, StreamAccessor::getStream(stream));
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+// pyrUp
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace imgproc
+    {
+        template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    }
+}}}
+
+void cv::cuda::pyrUp(InputArray _src, OutputArray _dst, Stream& stream)
+{
+    using namespace cv::cuda::device::imgproc;
+
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    static const func_t funcs[6][4] =
+    {
+        {pyrUp_gpu<uchar>      , 0 /*pyrUp_gpu<uchar2>*/ , pyrUp_gpu<uchar3>      , pyrUp_gpu<uchar4>      },
+        {0 /*pyrUp_gpu<schar>*/, 0 /*pyrUp_gpu<schar2>*/ , 0 /*pyrUp_gpu<schar3>*/, 0 /*pyrUp_gpu<schar4>*/},
+        {pyrUp_gpu<ushort>     , 0 /*pyrUp_gpu<ushort2>*/, pyrUp_gpu<ushort3>     , pyrUp_gpu<ushort4>     },
+        {pyrUp_gpu<short>      , 0 /*pyrUp_gpu<short2>*/ , pyrUp_gpu<short3>      , pyrUp_gpu<short4>      },
+        {0 /*pyrUp_gpu<int>*/  , 0 /*pyrUp_gpu<int2>*/   , 0 /*pyrUp_gpu<int3>*/  , 0 /*pyrUp_gpu<int4>*/  },
+        {pyrUp_gpu<float>      , 0 /*pyrUp_gpu<float2>*/ , pyrUp_gpu<float3>      , pyrUp_gpu<float4>      }
+    };
+
+    GpuMat src = _src.getGpuMat();
+
+    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
+
+    const func_t func = funcs[src.depth()][src.channels() - 1];
+    CV_Assert( func != 0 );
+
+    _dst.create(src.rows * 2, src.cols * 2, src.type());
+    GpuMat dst = _dst.getGpuMat();
+
+    func(src, dst, StreamAccessor::getStream(stream));
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// ImagePyramid
+
+#ifdef HAVE_OPENCV_CUDALEGACY
+
+namespace
+{
+    class ImagePyramidImpl : public ImagePyramid
+    {
+    public:
+        ImagePyramidImpl(InputArray img, int nLayers, Stream& stream);
+
+        void getLayer(OutputArray outImg, Size outRoi, Stream& stream = Stream::Null()) const;
+
+    private:
+        GpuMat layer0_;
+        std::vector<GpuMat> pyramid_;
+        int nLayers_;
+    };
+
+    ImagePyramidImpl::ImagePyramidImpl(InputArray _img, int numLayers, Stream& stream)
+    {
+        GpuMat img = _img.getGpuMat();
+
+        CV_Assert( img.depth() <= CV_32F && img.channels() <= 4 );
+
+        img.copyTo(layer0_, stream);
+
+        Size szLastLayer = img.size();
+        nLayers_ = 1;
+
+        if (numLayers <= 0)
+            numLayers = 255; // it will cut-off when any of the dimensions goes 1
+
+        pyramid_.resize(numLayers);
+
+        for (int i = 0; i < numLayers - 1; ++i)
+        {
+            Size szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
+
+            if (szCurLayer.width == 0 || szCurLayer.height == 0)
+                break;
+
+            ensureSizeIsEnough(szCurLayer, img.type(), pyramid_[i]);
+            nLayers_++;
+
+            const GpuMat& prevLayer = i == 0 ? layer0_ : pyramid_[i - 1];
+
+            cv::cuda::device::pyramid::downsampleX2(prevLayer, pyramid_[i], img.depth(), img.channels(), StreamAccessor::getStream(stream));
+
+            szLastLayer = szCurLayer;
+        }
+    }
+
+    void ImagePyramidImpl::getLayer(OutputArray _outImg, Size outRoi, Stream& stream) const
+    {
+        CV_Assert( outRoi.width <= layer0_.cols && outRoi.height <= layer0_.rows && outRoi.width > 0 && outRoi.height > 0 );
+
+        ensureSizeIsEnough(outRoi, layer0_.type(), _outImg);
+        GpuMat outImg = _outImg.getGpuMat();
+
+        if (outRoi.width == layer0_.cols && outRoi.height == layer0_.rows)
+        {
+            layer0_.copyTo(outImg, stream);
+            return;
+        }
+
+        float lastScale = 1.0f;
+        float curScale;
+        GpuMat lastLayer = layer0_;
+        GpuMat curLayer;
+
+        for (int i = 0; i < nLayers_ - 1; ++i)
+        {
+            curScale = lastScale * 0.5f;
+            curLayer = pyramid_[i];
+
+            if (outRoi.width == curLayer.cols && outRoi.height == curLayer.rows)
+            {
+                curLayer.copyTo(outImg, stream);
+            }
+
+            if (outRoi.width >= curLayer.cols && outRoi.height >= curLayer.rows)
+                break;
+
+            lastScale = curScale;
+            lastLayer = curLayer;
+        }
+
+        cv::cuda::device::pyramid::interpolateFrom1(lastLayer, outImg, outImg.depth(), outImg.channels(), StreamAccessor::getStream(stream));
+    }
+}
+
+#endif
+
+Ptr<ImagePyramid> cv::cuda::createImagePyramid(InputArray img, int nLayers, Stream& stream)
+{
+#ifndef HAVE_OPENCV_CUDALEGACY
+    (void) img;
+    (void) numLayers;
+    (void) stream;
+    throw_no_cuda();
+    return Ptr<ImagePyramid>();
+#else
+    return new ImagePyramidImpl(img, nLayers, stream);
+#endif
+}
+
+#endif // HAVE_CUDA
--- a/modules/cudawarping/src/remap.cpp
+++ b/modules/cudawarping/src/remap.cpp
@@ -0,0 +1,104 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+
+void cv::cuda::remap(InputArray, OutputArray, InputArray, InputArray, int, int, Scalar, Stream&){ throw_no_cuda(); }
+
+#else // HAVE_CUDA
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace imgproc
+    {
+        template <typename T>
+        void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst,
+                       int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    }
+}}}
+
+void cv::cuda::remap(InputArray _src, OutputArray _dst, InputArray _xmap, InputArray _ymap, int interpolation, int borderMode, Scalar borderValue, Stream& stream)
+{
+    using namespace cv::cuda::device::imgproc;
+
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation,
+        int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    static const func_t funcs[6][4] =
+    {
+        {remap_gpu<uchar>      , 0 /*remap_gpu<uchar2>*/ , remap_gpu<uchar3>     , remap_gpu<uchar4>     },
+        {0 /*remap_gpu<schar>*/, 0 /*remap_gpu<char2>*/  , 0 /*remap_gpu<char3>*/, 0 /*remap_gpu<char4>*/},
+        {remap_gpu<ushort>     , 0 /*remap_gpu<ushort2>*/, remap_gpu<ushort3>    , remap_gpu<ushort4>    },
+        {remap_gpu<short>      , 0 /*remap_gpu<short2>*/ , remap_gpu<short3>     , remap_gpu<short4>     },
+        {0 /*remap_gpu<int>*/  , 0 /*remap_gpu<int2>*/   , 0 /*remap_gpu<int3>*/ , 0 /*remap_gpu<int4>*/ },
+        {remap_gpu<float>      , 0 /*remap_gpu<float2>*/ , remap_gpu<float3>     , remap_gpu<float4>     }
+    };
+
+    GpuMat src = _src.getGpuMat();
+    GpuMat xmap = _xmap.getGpuMat();
+    GpuMat ymap = _ymap.getGpuMat();
+
+    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
+    CV_Assert( xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size() );
+    CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC );
+    CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP );
+
+    const func_t func = funcs[src.depth()][src.channels() - 1];
+    if (!func)
+        CV_Error(Error::StsUnsupportedFormat, "Unsupported input type");
+
+    _dst.create(xmap.size(), src.type());
+    GpuMat dst = _dst.getGpuMat();
+
+    Scalar_<float> borderValueFloat;
+    borderValueFloat = borderValue;
+
+    Size wholeSize;
+    Point ofs;
+    src.locateROI(wholeSize, ofs);
+
+    func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, xmap, ymap,
+        dst, interpolation, borderMode, borderValueFloat.val, StreamAccessor::getStream(stream), deviceSupports(FEATURE_SET_COMPUTE_20));
+}
+
+#endif // HAVE_CUDA
--- a/modules/cudawarping/src/resize.cpp
+++ b/modules/cudawarping/src/resize.cpp
@@ -0,0 +1,108 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+
+void cv::cuda::resize(InputArray, OutputArray, Size, double, double, int, Stream&) { throw_no_cuda(); }
+
+#else // HAVE_CUDA
+
+namespace cv { namespace cuda { namespace device
+{
+    template <typename T>
+    void resize(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+}}}
+
+void cv::cuda::resize(InputArray _src, OutputArray _dst, Size dsize, double fx, double fy, int interpolation, Stream& stream)
+{
+    GpuMat src = _src.getGpuMat();
+
+    typedef void (*func_t)(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    static const func_t funcs[6][4] =
+    {
+        {device::resize<uchar>      , 0 /*device::resize<uchar2>*/ , device::resize<uchar3>     , device::resize<uchar4>     },
+        {0 /*device::resize<schar>*/, 0 /*device::resize<char2>*/  , 0 /*device::resize<char3>*/, 0 /*device::resize<char4>*/},
+        {device::resize<ushort>     , 0 /*device::resize<ushort2>*/, device::resize<ushort3>    , device::resize<ushort4>    },
+        {device::resize<short>      , 0 /*device::resize<short2>*/ , device::resize<short3>     , device::resize<short4>     },
+        {0 /*device::resize<int>*/  , 0 /*device::resize<int2>*/   , 0 /*device::resize<int3>*/ , 0 /*device::resize<int4>*/ },
+        {device::resize<float>      , 0 /*device::resize<float2>*/ , device::resize<float3>     , device::resize<float4>     }
+    };
+
+    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
+    CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_AREA );
+    CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) );
+
+    if (dsize == Size())
+    {
+        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
+    }
+    else
+    {
+        fx = static_cast<double>(dsize.width) / src.cols;
+        fy = static_cast<double>(dsize.height) / src.rows;
+    }
+
+    _dst.create(dsize, src.type());
+    GpuMat dst = _dst.getGpuMat();
+
+    if (dsize == src.size())
+    {
+        src.copyTo(dst, stream);
+        return;
+    }
+
+    const func_t func = funcs[src.depth()][src.channels() - 1];
+
+    if (!func)
+        CV_Error(Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
+    Size wholeSize;
+    Point ofs;
+    src.locateROI(wholeSize, ofs);
+    PtrStepSzb wholeSrc(wholeSize.height, wholeSize.width, src.datastart, src.step);
+
+    func(src, wholeSrc, ofs.y, ofs.x, dst, static_cast<float>(1.0 / fy), static_cast<float>(1.0 / fx), interpolation, StreamAccessor::getStream(stream));
+}
+
+#endif // HAVE_CUDA
--- a/modules/cudawarping/src/warp.cpp
+++ b/modules/cudawarping/src/warp.cpp
@@ -0,0 +1,656 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::cuda;
+
+#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+
+void cv::cuda::warpAffine(InputArray, OutputArray, InputArray, Size, int, int, Scalar, Stream&) { throw_no_cuda(); }
+void cv::cuda::buildWarpAffineMaps(InputArray, bool, Size, OutputArray, OutputArray, Stream&) { throw_no_cuda(); }
+
+void cv::cuda::warpPerspective(InputArray, OutputArray, InputArray, Size, int, int, Scalar, Stream&) { throw_no_cuda(); }
+void cv::cuda::buildWarpPerspectiveMaps(InputArray, bool, Size, OutputArray, OutputArray, Stream&) { throw_no_cuda(); }
+
+void cv::cuda::buildWarpPlaneMaps(Size, Rect, InputArray, InputArray, InputArray, float, OutputArray, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::cuda::buildWarpCylindricalMaps(Size, Rect, InputArray, InputArray, float, OutputArray, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::cuda::buildWarpSphericalMaps(Size, Rect, InputArray, InputArray, float, OutputArray, OutputArray, Stream&) { throw_no_cuda(); }
+
+void cv::cuda::rotate(InputArray, OutputArray, Size, double, double, double, int, Stream&) { throw_no_cuda(); }
+
+#else // HAVE_CUDA
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace imgproc
+    {
+        void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream);
+
+        template <typename T>
+        void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
+                            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream);
+
+        template <typename T>
+        void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
+                            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+    }
+}}}
+
+void cv::cuda::buildWarpAffineMaps(InputArray _M, bool inverse, Size dsize, OutputArray _xmap, OutputArray _ymap, Stream& stream)
+{
+    using namespace cv::cuda::device::imgproc;
+
+    Mat M = _M.getMat();
+
+    CV_Assert( M.rows == 2 && M.cols == 3 );
+
+    _xmap.create(dsize, CV_32FC1);
+    _ymap.create(dsize, CV_32FC1);
+
+    GpuMat xmap = _xmap.getGpuMat();
+    GpuMat ymap = _ymap.getGpuMat();
+
+    float coeffs[2 * 3];
+    Mat coeffsMat(2, 3, CV_32F, (void*)coeffs);
+
+    if (inverse)
+        M.convertTo(coeffsMat, coeffsMat.type());
+    else
+    {
+        cv::Mat iM;
+        invertAffineTransform(M, iM);
+        iM.convertTo(coeffsMat, coeffsMat.type());
+    }
+
+    buildWarpAffineMaps_gpu(coeffs, xmap, ymap, StreamAccessor::getStream(stream));
+}
+
+void cv::cuda::buildWarpPerspectiveMaps(InputArray _M, bool inverse, Size dsize, OutputArray _xmap, OutputArray _ymap, Stream& stream)
+{
+    using namespace cv::cuda::device::imgproc;
+
+    Mat M = _M.getMat();
+
+    CV_Assert( M.rows == 3 && M.cols == 3 );
+
+    _xmap.create(dsize, CV_32FC1);
+    _ymap.create(dsize, CV_32FC1);
+
+    GpuMat xmap = _xmap.getGpuMat();
+    GpuMat ymap = _ymap.getGpuMat();
+
+    float coeffs[3 * 3];
+    Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);
+
+    if (inverse)
+        M.convertTo(coeffsMat, coeffsMat.type());
+    else
+    {
+        cv::Mat iM;
+        invert(M, iM);
+        iM.convertTo(coeffsMat, coeffsMat.type());
+    }
+
+    buildWarpPerspectiveMaps_gpu(coeffs, xmap, ymap, StreamAccessor::getStream(stream));
+}
+
+namespace
+{
+    template <int DEPTH> struct NppWarpFunc
+    {
+        typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
+
+        typedef NppStatus (*func_t)(const npp_type* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, npp_type* pDst,
+                                    int dstStep, NppiRect dstRoi, const double coeffs[][3],
+                                    int interpolation);
+    };
+
+    template <int DEPTH, typename NppWarpFunc<DEPTH>::func_t func> struct NppWarp
+    {
+        typedef typename NppWarpFunc<DEPTH>::npp_type npp_type;
+
+        static void call(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream)
+        {
+            static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
+
+            NppiSize srcsz;
+            srcsz.height = src.rows;
+            srcsz.width = src.cols;
+
+            NppiRect srcroi;
+            srcroi.x = 0;
+            srcroi.y = 0;
+            srcroi.height = src.rows;
+            srcroi.width = src.cols;
+
+            NppiRect dstroi;
+            dstroi.x = 0;
+            dstroi.y = 0;
+            dstroi.height = dst.rows;
+            dstroi.width = dst.cols;
+
+            cv::cuda::NppStreamHandler h(stream);
+
+            nppSafeCall( func(src.ptr<npp_type>(), srcsz, static_cast<int>(src.step), srcroi,
+                              dst.ptr<npp_type>(), static_cast<int>(dst.step), dstroi,
+                              coeffs, npp_inter[interpolation]) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+}
+
+void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& stream)
+{
+    GpuMat src = _src.getGpuMat();
+    Mat M = _M.getMat();
+
+    CV_Assert( M.rows == 2 && M.cols == 3 );
+
+    const int interpolation = flags & INTER_MAX;
+
+    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
+    CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC );
+    CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP );
+
+    _dst.create(dsize, src.type());
+    GpuMat dst = _dst.getGpuMat();
+
+    Size wholeSize;
+    Point ofs;
+    src.locateROI(wholeSize, ofs);
+
+    static const bool useNppTab[6][4][3] =
+    {
+        {
+            {false, false, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, false}
+        },
+        {
+            {false, false, false},
+            {false, false, false},
+            {false, false, false},
+            {false, false, false}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, false}
+        },
+        {
+            {false, false, false},
+            {false, false, false},
+            {false, false, false},
+            {false, false, false}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, true}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, true}
+        }
+    };
+
+    bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
+    // NPP bug on float data
+    useNpp = useNpp && src.depth() != CV_32F;
+
+    if (useNpp)
+    {
+        typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);
+
+        static const func_t funcs[2][6][4] =
+        {
+            {
+                {NppWarp<CV_8U, nppiWarpAffine_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffine_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffine_8u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_16U, nppiWarpAffine_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffine_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffine_16u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_32S, nppiWarpAffine_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffine_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffine_32s_C4R>::call},
+                {NppWarp<CV_32F, nppiWarpAffine_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffine_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffine_32f_C4R>::call}
+            },
+            {
+                {NppWarp<CV_8U, nppiWarpAffineBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffineBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffineBack_8u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_16U, nppiWarpAffineBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffineBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffineBack_16u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_32S, nppiWarpAffineBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffineBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffineBack_32s_C4R>::call},
+                {NppWarp<CV_32F, nppiWarpAffineBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffineBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffineBack_32f_C4R>::call}
+            }
+        };
+
+        dst.setTo(borderValue, stream);
+
+        double coeffs[2][3];
+        Mat coeffsMat(2, 3, CV_64F, (void*)coeffs);
+        M.convertTo(coeffsMat, coeffsMat.type());
+
+        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
+        CV_Assert(func != 0);
+
+        func(src, dst, coeffs, interpolation, StreamAccessor::getStream(stream));
+    }
+    else
+    {
+        using namespace cv::cuda::device::imgproc;
+
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
+            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        static const func_t funcs[6][4] =
+        {
+            {warpAffine_gpu<uchar>      , 0 /*warpAffine_gpu<uchar2>*/ , warpAffine_gpu<uchar3>     , warpAffine_gpu<uchar4>     },
+            {0 /*warpAffine_gpu<schar>*/, 0 /*warpAffine_gpu<char2>*/  , 0 /*warpAffine_gpu<char3>*/, 0 /*warpAffine_gpu<char4>*/},
+            {warpAffine_gpu<ushort>     , 0 /*warpAffine_gpu<ushort2>*/, warpAffine_gpu<ushort3>    , warpAffine_gpu<ushort4>    },
+            {warpAffine_gpu<short>      , 0 /*warpAffine_gpu<short2>*/ , warpAffine_gpu<short3>     , warpAffine_gpu<short4>     },
+            {0 /*warpAffine_gpu<int>*/  , 0 /*warpAffine_gpu<int2>*/   , 0 /*warpAffine_gpu<int3>*/ , 0 /*warpAffine_gpu<int4>*/ },
+            {warpAffine_gpu<float>      , 0 /*warpAffine_gpu<float2>*/ , warpAffine_gpu<float3>     , warpAffine_gpu<float4>     }
+        };
+
+        const func_t func = funcs[src.depth()][src.channels() - 1];
+        CV_Assert(func != 0);
+
+        float coeffs[2 * 3];
+        Mat coeffsMat(2, 3, CV_32F, (void*)coeffs);
+
+        if (flags & WARP_INVERSE_MAP)
+            M.convertTo(coeffsMat, coeffsMat.type());
+        else
+        {
+            cv::Mat iM;
+            invertAffineTransform(M, iM);
+            iM.convertTo(coeffsMat, coeffsMat.type());
+        }
+
+        Scalar_<float> borderValueFloat;
+        borderValueFloat = borderValue;
+
+        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
+            dst, interpolation, borderMode, borderValueFloat.val, StreamAccessor::getStream(stream), deviceSupports(FEATURE_SET_COMPUTE_20));
+    }
+}
+
+void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& stream)
+{
+    GpuMat src = _src.getGpuMat();
+    Mat M = _M.getMat();
+
+    CV_Assert( M.rows == 3 && M.cols == 3 );
+
+    const int interpolation = flags & INTER_MAX;
+
+    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
+    CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC );
+    CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP) ;
+
+    _dst.create(dsize, src.type());
+    GpuMat dst = _dst.getGpuMat();
+
+    Size wholeSize;
+    Point ofs;
+    src.locateROI(wholeSize, ofs);
+
+    static const bool useNppTab[6][4][3] =
+    {
+        {
+            {false, false, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, false}
+        },
+        {
+            {false, false, false},
+            {false, false, false},
+            {false, false, false},
+            {false, false, false}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, false}
+        },
+        {
+            {false, false, false},
+            {false, false, false},
+            {false, false, false},
+            {false, false, false}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, true}
+        },
+        {
+            {false, true, true},
+            {false, false, false},
+            {false, true, true},
+            {false, false, true}
+        }
+    };
+
+    bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
+    // NPP bug on float data
+    useNpp = useNpp && src.depth() != CV_32F;
+
+    if (useNpp)
+    {
+        typedef void (*func_t)(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);
+
+        static const func_t funcs[2][6][4] =
+        {
+            {
+                {NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call},
+                {NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call}
+            },
+            {
+                {NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call},
+                {0, 0, 0, 0},
+                {NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call},
+                {NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call}
+            }
+        };
+
+        dst.setTo(borderValue, stream);
+
+        double coeffs[3][3];
+        Mat coeffsMat(3, 3, CV_64F, (void*)coeffs);
+        M.convertTo(coeffsMat, coeffsMat.type());
+
+        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
+        CV_Assert(func != 0);
+
+        func(src, dst, coeffs, interpolation, StreamAccessor::getStream(stream));
+    }
+    else
+    {
+        using namespace cv::cuda::device::imgproc;
+
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
+            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+
+        static const func_t funcs[6][4] =
+        {
+            {warpPerspective_gpu<uchar>      , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3>     , warpPerspective_gpu<uchar4>     },
+            {0 /*warpPerspective_gpu<schar>*/, 0 /*warpPerspective_gpu<char2>*/  , 0 /*warpPerspective_gpu<char3>*/, 0 /*warpPerspective_gpu<char4>*/},
+            {warpPerspective_gpu<ushort>     , 0 /*warpPerspective_gpu<ushort2>*/, warpPerspective_gpu<ushort3>    , warpPerspective_gpu<ushort4>    },
+            {warpPerspective_gpu<short>      , 0 /*warpPerspective_gpu<short2>*/ , warpPerspective_gpu<short3>     , warpPerspective_gpu<short4>     },
+            {0 /*warpPerspective_gpu<int>*/  , 0 /*warpPerspective_gpu<int2>*/   , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ },
+            {warpPerspective_gpu<float>      , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3>     , warpPerspective_gpu<float4>     }
+        };
+
+        const func_t func = funcs[src.depth()][src.channels() - 1];
+        CV_Assert(func != 0);
+
+        float coeffs[3 * 3];
+        Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);
+
+        if (flags & WARP_INVERSE_MAP)
+            M.convertTo(coeffsMat, coeffsMat.type());
+        else
+        {
+            cv::Mat iM;
+            invert(M, iM);
+            iM.convertTo(coeffsMat, coeffsMat.type());
+        }
+
+        Scalar_<float> borderValueFloat;
+        borderValueFloat = borderValue;
+
+        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
+            dst, interpolation, borderMode, borderValueFloat.val, StreamAccessor::getStream(stream), deviceSupports(FEATURE_SET_COMPUTE_20));
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// buildWarpPlaneMaps
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace imgproc
+    {
+        void buildWarpPlaneMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
+                                const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,
+                                cudaStream_t stream);
+    }
+}}}
+
+void cv::cuda::buildWarpPlaneMaps(Size src_size, Rect dst_roi, InputArray _K, InputArray _R, InputArray _T,
+                                 float scale, OutputArray _map_x, OutputArray _map_y, Stream& stream)
+{
+    (void) src_size;
+
+    Mat K = _K.getMat();
+    Mat R = _R.getMat();
+    Mat T = _T.getMat();
+
+    CV_Assert( K.size() == Size(3,3) && K.type() == CV_32FC1 );
+    CV_Assert( R.size() == Size(3,3) && R.type() == CV_32FC1 );
+    CV_Assert( (T.size() == Size(3,1) || T.size() == Size(1,3)) && T.type() == CV_32FC1 && T.isContinuous() );
+
+    Mat K_Rinv = K * R.t();
+    Mat R_Kinv = R * K.inv();
+    CV_Assert( K_Rinv.isContinuous() );
+    CV_Assert( R_Kinv.isContinuous() );
+
+    _map_x.create(dst_roi.size(), CV_32FC1);
+    _map_y.create(dst_roi.size(), CV_32FC1);
+
+    GpuMat map_x = _map_x.getGpuMat();
+    GpuMat map_y = _map_y.getGpuMat();
+
+    device::imgproc::buildWarpPlaneMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(),
+                       T.ptr<float>(), scale, StreamAccessor::getStream(stream));
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// buildWarpCylyndricalMaps
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace imgproc
+    {
+        void buildWarpCylindricalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
+                                      const float k_rinv[9], const float r_kinv[9], float scale,
+                                      cudaStream_t stream);
+    }
+}}}
+
+void cv::cuda::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, InputArray _K, InputArray _R, float scale,
+                                       OutputArray _map_x, OutputArray _map_y, Stream& stream)
+{
+    (void) src_size;
+
+    Mat K = _K.getMat();
+    Mat R = _R.getMat();
+
+    CV_Assert( K.size() == Size(3,3) && K.type() == CV_32FC1 );
+    CV_Assert( R.size() == Size(3,3) && R.type() == CV_32FC1 );
+
+    Mat K_Rinv = K * R.t();
+    Mat R_Kinv = R * K.inv();
+    CV_Assert( K_Rinv.isContinuous() );
+    CV_Assert( R_Kinv.isContinuous() );
+
+    _map_x.create(dst_roi.size(), CV_32FC1);
+    _map_y.create(dst_roi.size(), CV_32FC1);
+
+    GpuMat map_x = _map_x.getGpuMat();
+    GpuMat map_y = _map_y.getGpuMat();
+
+    device::imgproc::buildWarpCylindricalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+// buildWarpSphericalMaps
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace imgproc
+    {
+        void buildWarpSphericalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
+                                    const float k_rinv[9], const float r_kinv[9], float scale,
+                                    cudaStream_t stream);
+    }
+}}}
+
+void cv::cuda::buildWarpSphericalMaps(Size src_size, Rect dst_roi, InputArray _K, InputArray _R, float scale,
+                                     OutputArray _map_x, OutputArray _map_y, Stream& stream)
+{
+    (void) src_size;
+
+    Mat K = _K.getMat();
+    Mat R = _R.getMat();
+
+    CV_Assert( K.size() == Size(3,3) && K.type() == CV_32FC1 );
+    CV_Assert( R.size() == Size(3,3) && R.type() == CV_32FC1 );
+
+    Mat K_Rinv = K * R.t();
+    Mat R_Kinv = R * K.inv();
+    CV_Assert( K_Rinv.isContinuous() );
+    CV_Assert( R_Kinv.isContinuous() );
+
+    _map_x.create(dst_roi.size(), CV_32FC1);
+    _map_y.create(dst_roi.size(), CV_32FC1);
+
+    GpuMat map_x = _map_x.getGpuMat();
+    GpuMat map_y = _map_y.getGpuMat();
+
+    device::imgproc::buildWarpSphericalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
+}
+
+////////////////////////////////////////////////////////////////////////
+// rotate
+
+namespace
+{
+    template <int DEPTH> struct NppRotateFunc
+    {
+        typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
+
+        typedef NppStatus (*func_t)(const npp_type* pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI,
+                                    npp_type* pDst, int nDstStep, NppiRect oDstROI,
+                                    double nAngle, double nShiftX, double nShiftY, int eInterpolation);
+    };
+
+    template <int DEPTH, typename NppRotateFunc<DEPTH>::func_t func> struct NppRotate
+    {
+        typedef typename NppRotateFunc<DEPTH>::npp_type npp_type;
+
+        static void call(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream)
+        {
+            (void)dsize;
+            static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
+
+            NppStreamHandler h(stream);
+
+            NppiSize srcsz;
+            srcsz.height = src.rows;
+            srcsz.width = src.cols;
+            NppiRect srcroi;
+            srcroi.x = srcroi.y = 0;
+            srcroi.height = src.rows;
+            srcroi.width = src.cols;
+            NppiRect dstroi;
+            dstroi.x = dstroi.y = 0;
+            dstroi.height = dst.rows;
+            dstroi.width = dst.cols;
+
+            nppSafeCall( func(src.ptr<npp_type>(), srcsz, static_cast<int>(src.step), srcroi,
+                dst.ptr<npp_type>(), static_cast<int>(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+}
+
+void cv::cuda::rotate(InputArray _src, OutputArray _dst, Size dsize, double angle, double xShift, double yShift, int interpolation, Stream& stream)
+{
+    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream);
+    static const func_t funcs[6][4] =
+    {
+        {NppRotate<CV_8U, nppiRotate_8u_C1R>::call, 0, NppRotate<CV_8U, nppiRotate_8u_C3R>::call, NppRotate<CV_8U, nppiRotate_8u_C4R>::call},
+        {0,0,0,0},
+        {NppRotate<CV_16U, nppiRotate_16u_C1R>::call, 0, NppRotate<CV_16U, nppiRotate_16u_C3R>::call, NppRotate<CV_16U, nppiRotate_16u_C4R>::call},
+        {0,0,0,0},
+        {0,0,0,0},
+        {NppRotate<CV_32F, nppiRotate_32f_C1R>::call, 0, NppRotate<CV_32F, nppiRotate_32f_C3R>::call, NppRotate<CV_32F, nppiRotate_32f_C4R>::call}
+    };
+
+    GpuMat src = _src.getGpuMat();
+
+    CV_Assert( src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F );
+    CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
+    CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC );
+
+    _dst.create(dsize, src.type());
+    GpuMat dst = _dst.getGpuMat();
+
+    dst.setTo(Scalar::all(0), stream);
+
+    funcs[src.depth()][src.channels() - 1](src, dst, dsize, angle, xShift, yShift, interpolation, StreamAccessor::getStream(stream));
+}
+
+#endif // HAVE_CUDA