GPU: updated upsample and downsample functions, added pyrDown and pyrUp, added support for 16S filtering; ported the spherical warper (from opencv_stitching) to the GPU

Alexey Spizhevoy
2011-06-30 14:39:48 +00:00
parent a44d6aacc8
commit 674b763395
19 changed files with 697 additions and 378 deletions

View File

@@ -647,4 +647,26 @@ namespace cv { namespace gpu { namespace mathfunc
template void threshold_gpu<int>(const DevMem2D& src, const DevMem2D& dst, int thresh, int maxVal, int type, cudaStream_t stream);
template void threshold_gpu<float>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
template void threshold_gpu<double>(const DevMem2D& src, const DevMem2D& dst, double thresh, double maxVal, int type, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////
// subtract
template <typename T>
class SubtractOp
{
public:
__device__ __forceinline__ T operator()(const T& l, const T& r) const
{
return l - r;
}
};
template <typename T>
void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
{
transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, SubtractOp<T>(), stream);
}
template void subtractCaller<short>(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream);
}}}

View File

@@ -224,6 +224,7 @@ namespace cv { namespace gpu { namespace filters
template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<short3, float3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<int , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}
@@ -275,7 +276,7 @@ namespace cv { namespace gpu { namespace filters
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
B<T> b(src.rows, src.step);
if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
{
@@ -364,6 +365,7 @@ namespace cv { namespace gpu { namespace filters
template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, short3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}

View File

@@ -42,14 +42,6 @@
#include "internal_shared.hpp"
// Other values are not supported
#define CELL_WIDTH 8
#define CELL_HEIGHT 8
@@ -776,4 +768,4 @@ static void resize_for_hog(const DevMem2D& src, DevMem2D dst, TEX& tex)
void resize_8UC1(const DevMem2D& src, DevMem2D dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
void resize_8UC4(const DevMem2D& src, DevMem2D dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
}}}

View File

@@ -66,8 +66,8 @@ namespace cv { namespace gpu { namespace imgproc
}
}
__global__ void remap_3c(const uchar* src, size_t src_step, const float* mapx, const float* mapy,
size_t map_step, uchar* dst, size_t dst_step, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -131,7 +131,7 @@ namespace cv { namespace gpu { namespace imgproc
grid.x = divUp(dst.cols, threads.x);
grid.y = divUp(dst.rows, threads.y);
tex_remap.filterMode = cudaFilterModeLinear;
tex_remap.addressMode[0] = tex_remap.addressMode[1] = cudaAddressModeWrap;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D(0, tex_remap, src.data, desc, src.cols, src.rows, src.step) );
@@ -139,7 +139,7 @@ namespace cv { namespace gpu { namespace imgproc
remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture(tex_remap) );
}
@@ -151,9 +151,9 @@ namespace cv { namespace gpu { namespace imgproc
grid.y = divUp(dst.rows, threads.y);
remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
/////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
@@ -768,6 +768,7 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulSpectrums
@@ -796,6 +797,7 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulSpectrums_CONJ
@@ -825,6 +827,7 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums
@@ -855,6 +858,7 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums_CONJ
@@ -885,34 +889,173 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaDeviceSynchronize() );
}
/////////////////////////////////////////////////////////////////////////
// downsample
template <typename T, int cn>
__global__ void downsampleKernel(const PtrStep_<T> src, DevMem2D_<T> dst)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
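// dst is addressed as interleaved scalars: ch_x is the pixel index and
// (x - ch_x*cn) the channel within it; each output scalar copies the same
// channel of source pixel (2*ch_x, 2*y).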
if (x < dst.cols && y < dst.rows)
{
int ch_x = x / cn;
dst.ptr(y)[x] = src.ptr(y*2)[ch_x*2*cn + x - ch_x*cn];
}
}
template <typename T, int cn>
void downsampleCaller(const DevMem2D src, DevMem2D dst)
{
dim3 threads(32, 8);
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
downsampleKernel<T,cn><<<grid,threads>>>(DevMem2D_<T>(src), DevMem2D_<T>(dst));
cudaSafeCall(cudaGetLastError());
cudaSafeCall(cudaDeviceSynchronize());
}
template void downsampleCaller<uchar,1>(const DevMem2D src, DevMem2D dst);
template void downsampleCaller<uchar,2>(const DevMem2D src, DevMem2D dst);
template void downsampleCaller<uchar,3>(const DevMem2D src, DevMem2D dst);
template void downsampleCaller<uchar,4>(const DevMem2D src, DevMem2D dst);
template void downsampleCaller<short,1>(const DevMem2D src, DevMem2D dst);
template void downsampleCaller<short,2>(const DevMem2D src, DevMem2D dst);
template void downsampleCaller<short,3>(const DevMem2D src, DevMem2D dst);
template void downsampleCaller<short,4>(const DevMem2D src, DevMem2D dst);
template void downsampleCaller<float,1>(const DevMem2D src, DevMem2D dst);
template void downsampleCaller<float,2>(const DevMem2D src, DevMem2D dst);
template void downsampleCaller<float,3>(const DevMem2D src, DevMem2D dst);
template void downsampleCaller<float,4>(const DevMem2D src, DevMem2D dst);
//////////////////////////////////////////////////////////////////////////
// upsample
template <typename T, int cn>
__global__ void upsampleKernel(const PtrStep_<T> src, DevMem2D_<T> dst)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
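// Zero-stuffing upsample: scalars on odd pixel columns or odd rows become 0;
// the rest copy the matching channel of source pixel (ch_x/2, y/2).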
if (x < dst.cols && y < dst.rows)
{
int ch_x = x / cn;
T val = ((ch_x & 1) || (y & 1)) ? 0 : src.ptr(y/2)[ch_x/2*cn + x - ch_x*cn];
dst.ptr(y)[x] = val;
}
}
template <typename T, int cn>
void upsampleCaller(const DevMem2D src, DevMem2D dst)
{
dim3 threads(32, 8);
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
upsampleKernel<T,cn><<<grid,threads>>>(DevMem2D_<T>(src), DevMem2D_<T>(dst));
cudaSafeCall(cudaGetLastError());
cudaSafeCall(cudaDeviceSynchronize());
}
template void upsampleCaller<uchar,1>(const DevMem2D src, DevMem2D dst);
template void upsampleCaller<uchar,2>(const DevMem2D src, DevMem2D dst);
template void upsampleCaller<uchar,3>(const DevMem2D src, DevMem2D dst);
template void upsampleCaller<uchar,4>(const DevMem2D src, DevMem2D dst);
template void upsampleCaller<short,1>(const DevMem2D src, DevMem2D dst);
template void upsampleCaller<short,2>(const DevMem2D src, DevMem2D dst);
template void upsampleCaller<short,3>(const DevMem2D src, DevMem2D dst);
template void upsampleCaller<short,4>(const DevMem2D src, DevMem2D dst);
template void upsampleCaller<float,1>(const DevMem2D src, DevMem2D dst);
template void upsampleCaller<float,2>(const DevMem2D src, DevMem2D dst);
template void upsampleCaller<float,3>(const DevMem2D src, DevMem2D dst);
template void upsampleCaller<float,4>(const DevMem2D src, DevMem2D dst);
//////////////////////////////////////////////////////////////////////////
// buildWarpMaps
namespace build_warp_maps
{
__constant__ float cr[9];
__constant__ float crinv[9];
__constant__ float cf, cs;
__constant__ float chalf_w, chalf_h;
}
class SphericalMapper
{
public:
static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
{
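// Backward spherical map: (u, v)/cs are the spherical angles of the warped
// pixel. Build the corresponding unit direction on the sphere, rotate it
// back by crinv, then project it pinhole-style with focal length cf onto
// the source image (principal point at (chalf_w, chalf_h)).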
using namespace build_warp_maps;
v /= cs;
u /= cs;
float sinv = sinf(v);
float x_ = sinv * sinf(u);
float y_ = -cosf(v);
float z_ = sinv * cosf(u);
float z;
x = crinv[0]*x_ + crinv[1]*y_ + crinv[2]*z_;
y = crinv[3]*x_ + crinv[4]*y_ + crinv[5]*z_;
z = crinv[6]*x_ + crinv[7]*y_ + crinv[8]*z_;
x = cf*x/z + chalf_w;
y = cf*y/z + chalf_h;
}
};
template <typename Mapper>
__global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows,
PtrStepf map_x, PtrStepf map_y)
{
int du = blockIdx.x * blockDim.x + threadIdx.x;
int dv = blockIdx.y * blockDim.y + threadIdx.y;
if (du < cols && dv < rows)
{
float u = tl_u + du;
float v = tl_v + dv;
float x, y;
Mapper::mapBackward(u, v, x, y);
map_x.ptr(dv)[du] = x;
map_y.ptr(dv)[du] = y;
}
}
void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
const float r[9], const float rinv[9], float f, float s,
float half_w, float half_h, cudaStream_t stream)
{
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr, r, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::crinv, rinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cf, &f, sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cs, &s, sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::chalf_w, &half_w, sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::chalf_h, &half_h, sizeof(float)));
int cols = map_x.cols;
int rows = map_x.rows;
dim3 threads(32, 8);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
buildWarpMapsKernel<SphericalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
}}}

View File

@@ -49,6 +49,14 @@
#include "npp.h"
#include "NPP_staging.hpp"
#ifndef CV_PI_F
#ifndef CV_PI
#define CV_PI_F 3.14159265f
#else
#define CV_PI_F ((float)CV_PI)
#endif
#endif
namespace cv
{
namespace gpu

View File

@@ -174,9 +174,22 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
nppArithmCaller(src1, src2, dst, nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, nppiAdd_32s_C1R, nppiAdd_32f_C1R, StreamAccessor::getStream(stream));
}
namespace cv { namespace gpu { namespace mathfunc
{
template <typename T>
void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream);
}}}
void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
if (src1.depth() == CV_16S && src2.depth() == CV_16S)
{
CV_Assert(src1.size() == src2.size());
dst.create(src1.size(), src1.type());
mathfunc::subtractCaller<short>(src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
}
else
nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R, StreamAccessor::getStream(stream));
}
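
A minimal host-side sketch of the new CV_16S path (hypothetical variable names; assumes the default-stream overload):

GpuMat a(128, 128, CV_16SC3), b(128, 128, CV_16SC3), c;
// Both depths are CV_16S, so this takes the subtractCaller<short> branch above.
cv::gpu::subtract(a, b, c);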
void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
@@ -755,4 +768,4 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
return thresh;
}
#endif

View File

@@ -192,7 +192,8 @@ namespace
Size src_size = src.size();
dst.create(src_size, dstType);
ensureSizeIsEnough(src_size, bufType, dstBuf);
//dstBuf.create(src_size, bufType);
if (stream)
{
@@ -717,7 +718,7 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_16SC1 || srcType == CV_16SC2
|| srcType == CV_16SC3 || srcType == CV_32SC1 || srcType == CV_32FC1);
CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(srcType) == CV_MAT_CN(bufType));
@@ -747,6 +748,9 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
case CV_16SC2:
func = filters::linearRowFilter_gpu<short2, float2>;
break;
case CV_16SC3:
func = filters::linearRowFilter_gpu<short3, float3>;
break;
case CV_32SC1:
func = filters::linearRowFilter_gpu<int, float>;
break;
@@ -827,8 +831,8 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 || dstType == CV_16SC1 || dstType == CV_16SC2
|| dstType == CV_16SC3 || dstType == CV_32SC1 || dstType == CV_32FC1);
CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(dstType) == CV_MAT_CN(bufType));
@@ -858,6 +862,9 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
case CV_16SC2:
func = filters::linearColumnFilter_gpu<float2, short2>;
break;
case CV_16SC3:
func = filters::linearColumnFilter_gpu<float3, short3>;
break;
case CV_32SC1:
func = filters::linearColumnFilter_gpu<float, int>;
break;

View File

@@ -56,6 +56,8 @@ void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&)
void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, const Scalar&, Stream&) { throw_nogpu(); }
void cv::gpu::warpAffine(const GpuMat&, GpuMat&, const Mat&, Size, int, Stream&) { throw_nogpu(); }
void cv::gpu::warpPerspective(const GpuMat&, GpuMat&, const Mat&, Size, int, Stream&) { throw_nogpu(); }
void cv::gpu::buildWarpSphericalMaps(Size, Rect, const Mat&, double, double,
GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int, Stream&) { throw_nogpu(); }
void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
@@ -76,7 +78,11 @@ void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int) { throw_nogpu(); }
void cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&) { throw_nogpu(); }
void cv::gpu::downsample(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::upsample(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::pyrDown(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::pyrUp(const GpuMat&, GpuMat&) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
@@ -504,6 +510,30 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
nppWarpCaller(src, dst, coeffs, dsize, flags, npp_warpPerspective_8u, npp_warpPerspective_16u, npp_warpPerspective_32s, npp_warpPerspective_32f, StreamAccessor::getStream(s));
}
//////////////////////////////////////////////////////////////////////////////
// buildWarpSphericalMaps
namespace cv { namespace gpu { namespace imgproc
{
void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
const float r[9], const float rinv[9], float f, float s,
float half_w, float half_h, cudaStream_t stream);
}}}
void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat& R, double f, double s,
GpuMat& map_x, GpuMat& map_y, Stream& stream)
{
CV_Assert(R.size() == Size(3,3) && R.isContinuous() && R.type() == CV_32F);
Mat Rinv = R.inv();
CV_Assert(Rinv.isContinuous());
map_x.create(dst_roi.size(), CV_32F);
map_y.create(dst_roi.size(), CV_32F);
imgproc::buildWarpSphericalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, R.ptr<float>(), Rinv.ptr<float>(),
f, s, 0.5f*src_size.width, 0.5f*src_size.height, StreamAccessor::getStream(stream));
}
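
A minimal host-side sketch of the new warper (hypothetical variable names; assumes the module's existing cv::gpu::remap overload that takes xmap/ymap):

// R: 3x3 CV_32F rotation, f: focal length, s: scale of the warped image
GpuMat xmap, ymap, warped;
Rect dst_roi(0, 0, 2 * src.cols, src.rows); // hypothetical destination ROI
cv::gpu::buildWarpSphericalMaps(src.size(), dst_roi, R, f, s, xmap, ymap);
cv::gpu::remap(src, warped, xmap, ymap);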
////////////////////////////////////////////////////////////////////////
// rotate
@@ -1333,32 +1363,96 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
cufftSafeCall(cufftDestroy(planC2R));
}
////////////////////////////////////////////////////////////////////
// downsample
namespace cv { namespace gpu { namespace imgproc
{
template <typename T>
void downsampleCaller(const PtrStep_<T> src, int rows, int cols, int k, PtrStep_<T> dst);
template <typename T, int cn>
void downsampleCaller(const DevMem2D src, DevMem2D dst);
}}}
void cv::gpu::downsample(const GpuMat& src, GpuMat& dst)
{
CV_Assert(src.depth() < CV_64F && src.channels() <= 4);
typedef void (*Caller)(const DevMem2D, DevMem2D);
static const Caller callers[6][4] =
{{imgproc::downsampleCaller<uchar,1>, imgproc::downsampleCaller<uchar,2>,
imgproc::downsampleCaller<uchar,3>, imgproc::downsampleCaller<uchar,4>},
{0,0,0,0}, {0,0,0,0},
{imgproc::downsampleCaller<short,1>, imgproc::downsampleCaller<short,2>,
imgproc::downsampleCaller<short,3>, imgproc::downsampleCaller<short,4>},
{0,0,0,0},
{imgproc::downsampleCaller<float,1>, imgproc::downsampleCaller<float,2>,
imgproc::downsampleCaller<float,3>, imgproc::downsampleCaller<float,4>}};
Caller caller = callers[src.depth()][src.channels()-1];
if (!caller)
CV_Error(CV_StsUnsupportedFormat, "bad number of channels");
dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
caller(src, dst.reshape(1));
}
//////////////////////////////////////////////////////////////////////////////
// upsample
namespace cv { namespace gpu { namespace imgproc
{
template <typename T, int cn>
void upsampleCaller(const DevMem2D src, DevMem2D dst);
}}}
void cv::gpu::upsample(const GpuMat& src, GpuMat& dst)
{
CV_Assert(src.depth() < CV_64F && src.channels() <= 4);
typedef void (*Caller)(const DevMem2D, DevMem2D);
static const Caller callers[6][4] =
{{imgproc::upsampleCaller<uchar,1>, imgproc::upsampleCaller<uchar,2>,
imgproc::upsampleCaller<uchar,3>, imgproc::upsampleCaller<uchar,4>},
{0,0,0,0}, {0,0,0,0},
{imgproc::upsampleCaller<short,1>, imgproc::upsampleCaller<short,2>,
imgproc::upsampleCaller<short,3>, imgproc::upsampleCaller<short,4>},
{0,0,0,0},
{imgproc::upsampleCaller<float,1>, imgproc::upsampleCaller<float,2>,
imgproc::upsampleCaller<float,3>, imgproc::upsampleCaller<float,4>}};
Caller caller = callers[src.depth()][src.channels()-1];
if (!caller)
CV_Error(CV_StsUnsupportedFormat, "bad number of channels");
dst.create(src.rows*2, src.cols*2, src.type());
caller(src, dst.reshape(1));
}
//////////////////////////////////////////////////////////////////////////////
// pyrDown
void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst)
{
Mat ker = getGaussianKernel(5, 0, std::max(CV_32F, src.depth()));
GpuMat buf;
sepFilter2D(src, buf, src.depth(), ker, ker);
downsample(buf, dst);
}
//////////////////////////////////////////////////////////////////////////////
// pyrUp
void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst)
{
GpuMat buf;
upsample(src, buf);
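// upsample() leaves three of every four samples zero; doubling the 1-D
// Gaussian kernel (it is applied once per axis) restores that factor of 4
// and keeps the overall brightness.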
Mat ker = getGaussianKernel(5, 0, std::max(CV_32F, src.depth())) * 2;
sepFilter2D(buf, dst, buf.depth(), ker, ker);
}
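
Taken together, a round trip through the new pyramid functions looks like this (hypothetical input; output sizes are derived by the functions themselves):

GpuMat img(480, 640, CV_8UC3), down, up;
cv::gpu::pyrDown(img, down); // smooth, then keep every second pixel -> 240x320
cv::gpu::pyrUp(down, up); // zero-stuff to 480x640, then smooth with the doubled kernel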
#endif /* !defined (HAVE_CUDA) */

View File

@@ -594,8 +594,9 @@ void cv::gpu::createContinuous(int rows, int cols, int type, GpuMat& m)
void cv::gpu::ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)
{
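// If the current allocation is already big enough, reuse it and hand back
// a top-left ROI of the requested size instead of reallocating.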
if (m.type() == type && m.rows >= rows && m.cols >= cols)
m = m(Rect(0, 0, cols, rows));
else
m.create(rows, cols, type);
}

View File

@@ -104,13 +104,13 @@ namespace cv { namespace gpu { namespace device
template <typename T>
__device__ __forceinline__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_low(i)*step));
}
template <typename T>
__device__ __forceinline__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_high(i)*step));
}
private:
@@ -174,13 +174,13 @@ namespace cv { namespace gpu { namespace device
template <typename T>
__device__ __forceinline__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_low(i)*step));
}
template <typename T>
__device__ __forceinline__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_high(i)*step));
}
private:
@@ -222,13 +222,13 @@ namespace cv { namespace gpu { namespace device
template <typename T>
__device__ __forceinline__ D at_low(int i, const T* data) const
{
return i >= 0 ? saturate_cast<D>(*(const D*)((const char*)data + i*step)) : val;
}
template <typename T>
__device__ __forceinline__ D at_high(int i, const T* data) const
{
return i < len ? saturate_cast<D>(*(const D*)((const char*)data + i*step)) : val;
}
bool is_range_safe(int mini, int maxi) const
@@ -241,6 +241,25 @@ namespace cv { namespace gpu { namespace device
int step;
D val;
};
template <typename OutT>
struct BrdConstant
{
BrdConstant(int w, int h, const OutT &val = VecTraits<OutT>::all(0)) : w(w), h(h), val(val) {}
__device__ __forceinline__ OutT at(int x, int y, const uchar* data, int step) const
{
if (x >= 0 && x <= w - 1 && y >= 0 && y <= h - 1)
return ((const OutT*)(data + y * step))[x];
return val;
}
private:
int w, h;
OutT val;
};
}}}
#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__