added BORDER_REFLECT and BORDER_WRAP support to gpu module

switched to gpu::remap in opencv_stitching
Vladislav Vinogradov
2011-08-31 13:21:14 +00:00
parent a5df21bf24
commit ea2f5b1391
10 changed files with 450 additions and 107 deletions
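
The gist of the change: everywhere the gpu module previously accepted only BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT, it now also accepts BORDER_REFLECT and BORDER_WRAP, backed by new BrdReflect/BrdWrap device functors. A minimal sketch of exercising the new modes through the public API (the exact cv::gpu::remap signature is assumed from the CV_Assert lines in the remap hunk below):

    #include <opencv2/gpu/gpu.hpp>

    void remap_with_wrap(const cv::gpu::GpuMat& src,
                         const cv::gpu::GpuMat& xmap,
                         const cv::gpu::GpuMat& ymap,
                         cv::gpu::GpuMat& dst)
    {
        // Previously rejected by the borderMode assert; now dispatches
        // to the new BrdWrap device functor.
        cv::gpu::remap(src, dst, xmap, ymap, cv::INTER_LINEAR, cv::BORDER_WRAP);
    }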


@@ -151,7 +151,7 @@ namespace cv { namespace gpu { namespace filters
void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
static const caller_t callers[3][17] =
static const caller_t callers[5][17] =
{
{
0,
@@ -170,7 +170,7 @@ namespace cv { namespace gpu { namespace filters
linearRowFilter_caller<13, T, D, BrdRowReflect101>,
linearRowFilter_caller<14, T, D, BrdRowReflect101>,
linearRowFilter_caller<15, T, D, BrdRowReflect101>,
linearRowFilter_caller<16, T, D, BrdRowReflect101>,
linearRowFilter_caller<16, T, D, BrdRowReflect101>
},
{
0,
@@ -189,7 +189,7 @@ namespace cv { namespace gpu { namespace filters
linearRowFilter_caller<13, T, D, BrdRowReplicate>,
linearRowFilter_caller<14, T, D, BrdRowReplicate>,
linearRowFilter_caller<15, T, D, BrdRowReplicate>,
linearRowFilter_caller<16, T, D, BrdRowReplicate>,
linearRowFilter_caller<16, T, D, BrdRowReplicate>
},
{
0,
@@ -208,7 +208,45 @@ namespace cv { namespace gpu { namespace filters
linearRowFilter_caller<13, T, D, BrdRowConstant>,
linearRowFilter_caller<14, T, D, BrdRowConstant>,
linearRowFilter_caller<15, T, D, BrdRowConstant>,
linearRowFilter_caller<16, T, D, BrdRowConstant>,
linearRowFilter_caller<16, T, D, BrdRowConstant>
},
{
0,
linearRowFilter_caller<1 , T, D, BrdRowReflect>,
linearRowFilter_caller<2 , T, D, BrdRowReflect>,
linearRowFilter_caller<3 , T, D, BrdRowReflect>,
linearRowFilter_caller<4 , T, D, BrdRowReflect>,
linearRowFilter_caller<5 , T, D, BrdRowReflect>,
linearRowFilter_caller<6 , T, D, BrdRowReflect>,
linearRowFilter_caller<7 , T, D, BrdRowReflect>,
linearRowFilter_caller<8 , T, D, BrdRowReflect>,
linearRowFilter_caller<9 , T, D, BrdRowReflect>,
linearRowFilter_caller<10, T, D, BrdRowReflect>,
linearRowFilter_caller<11, T, D, BrdRowReflect>,
linearRowFilter_caller<12, T, D, BrdRowReflect>,
linearRowFilter_caller<13, T, D, BrdRowReflect>,
linearRowFilter_caller<14, T, D, BrdRowReflect>,
linearRowFilter_caller<15, T, D, BrdRowReflect>,
linearRowFilter_caller<16, T, D, BrdRowReflect>
},
{
0,
linearRowFilter_caller<1 , T, D, BrdRowWrap>,
linearRowFilter_caller<2 , T, D, BrdRowWrap>,
linearRowFilter_caller<3 , T, D, BrdRowWrap>,
linearRowFilter_caller<4 , T, D, BrdRowWrap>,
linearRowFilter_caller<5 , T, D, BrdRowWrap>,
linearRowFilter_caller<6 , T, D, BrdRowWrap>,
linearRowFilter_caller<7 , T, D, BrdRowWrap>,
linearRowFilter_caller<8 , T, D, BrdRowWrap>,
linearRowFilter_caller<9 , T, D, BrdRowWrap>,
linearRowFilter_caller<10, T, D, BrdRowWrap>,
linearRowFilter_caller<11, T, D, BrdRowWrap>,
linearRowFilter_caller<12, T, D, BrdRowWrap>,
linearRowFilter_caller<13, T, D, BrdRowWrap>,
linearRowFilter_caller<14, T, D, BrdRowWrap>,
linearRowFilter_caller<15, T, D, BrdRowWrap>,
linearRowFilter_caller<16, T, D, BrdRowWrap>
}
};
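// The table above is indexed as callers[brd_type][ksize]: widening the outer
// dimension from 3 to 5 adds the BrdRowReflect and BrdRowWrap rows, in the
// same order as the internal GPU border enum. A sketch of the dispatch this
// enables (the actual call line sits just past the hunk; the cast pattern
// follows the pyrDown caller in the imgproc hunks below):
// callers[brd_type][ksize](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), anchor, stream);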
@@ -292,7 +330,7 @@ namespace cv { namespace gpu { namespace filters
void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
static const caller_t callers[3][17] =
static const caller_t callers[5][17] =
{
{
0,
@@ -311,7 +349,7 @@ namespace cv { namespace gpu { namespace filters
linearColumnFilter_caller<13, T, D, BrdColReflect101>,
linearColumnFilter_caller<14, T, D, BrdColReflect101>,
linearColumnFilter_caller<15, T, D, BrdColReflect101>,
linearColumnFilter_caller<16, T, D, BrdColReflect101>,
linearColumnFilter_caller<16, T, D, BrdColReflect101>
},
{
0,
@@ -330,7 +368,7 @@ namespace cv { namespace gpu { namespace filters
linearColumnFilter_caller<13, T, D, BrdColReplicate>,
linearColumnFilter_caller<14, T, D, BrdColReplicate>,
linearColumnFilter_caller<15, T, D, BrdColReplicate>,
linearColumnFilter_caller<16, T, D, BrdColReplicate>,
linearColumnFilter_caller<16, T, D, BrdColReplicate>
},
{
0,
@@ -349,7 +387,45 @@ namespace cv { namespace gpu { namespace filters
linearColumnFilter_caller<13, T, D, BrdColConstant>,
linearColumnFilter_caller<14, T, D, BrdColConstant>,
linearColumnFilter_caller<15, T, D, BrdColConstant>,
linearColumnFilter_caller<16, T, D, BrdColConstant>,
linearColumnFilter_caller<16, T, D, BrdColConstant>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReflect>,
linearColumnFilter_caller<2 , T, D, BrdColReflect>,
linearColumnFilter_caller<3 , T, D, BrdColReflect>,
linearColumnFilter_caller<4 , T, D, BrdColReflect>,
linearColumnFilter_caller<5 , T, D, BrdColReflect>,
linearColumnFilter_caller<6 , T, D, BrdColReflect>,
linearColumnFilter_caller<7 , T, D, BrdColReflect>,
linearColumnFilter_caller<8 , T, D, BrdColReflect>,
linearColumnFilter_caller<9 , T, D, BrdColReflect>,
linearColumnFilter_caller<10, T, D, BrdColReflect>,
linearColumnFilter_caller<11, T, D, BrdColReflect>,
linearColumnFilter_caller<12, T, D, BrdColReflect>,
linearColumnFilter_caller<13, T, D, BrdColReflect>,
linearColumnFilter_caller<14, T, D, BrdColReflect>,
linearColumnFilter_caller<15, T, D, BrdColReflect>,
linearColumnFilter_caller<16, T, D, BrdColReflect>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColWrap>,
linearColumnFilter_caller<2 , T, D, BrdColWrap>,
linearColumnFilter_caller<3 , T, D, BrdColWrap>,
linearColumnFilter_caller<4 , T, D, BrdColWrap>,
linearColumnFilter_caller<5 , T, D, BrdColWrap>,
linearColumnFilter_caller<6 , T, D, BrdColWrap>,
linearColumnFilter_caller<7 , T, D, BrdColWrap>,
linearColumnFilter_caller<8 , T, D, BrdColWrap>,
linearColumnFilter_caller<9 , T, D, BrdColWrap>,
linearColumnFilter_caller<10, T, D, BrdColWrap>,
linearColumnFilter_caller<11, T, D, BrdColWrap>,
linearColumnFilter_caller<12, T, D, BrdColWrap>,
linearColumnFilter_caller<13, T, D, BrdColWrap>,
linearColumnFilter_caller<14, T, D, BrdColWrap>,
linearColumnFilter_caller<15, T, D, BrdColWrap>,
linearColumnFilter_caller<16, T, D, BrdColWrap>
}
};


@@ -120,10 +120,10 @@ namespace cv { namespace gpu { namespace imgproc
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, T borderValue);
static const caller_t callers[2][3] =
static const caller_t callers[2][5] =
{
{ remap_caller<PointFilter, BrdReflect101>, remap_caller<PointFilter, BrdReplicate>, remap_caller<PointFilter, BrdConstant> },
{ remap_caller<LinearFilter, BrdReflect101>, remap_caller<LinearFilter, BrdReplicate>, remap_caller<LinearFilter, BrdConstant> }
{ remap_caller<PointFilter, BrdReflect101>, remap_caller<PointFilter, BrdReplicate>, remap_caller<PointFilter, BrdConstant>, remap_caller<PointFilter, BrdReflect>, remap_caller<PointFilter, BrdWrap> },
{ remap_caller<LinearFilter, BrdReflect101>, remap_caller<LinearFilter, BrdReplicate>, remap_caller<LinearFilter, BrdConstant>, remap_caller<LinearFilter, BrdReflect>, remap_caller<LinearFilter, BrdWrap> }
};
typename VecTraits<T>::elem_type brd[] = {(typename VecTraits<T>::elem_type)borderValue[0], (typename VecTraits<T>::elem_type)borderValue[1], (typename VecTraits<T>::elem_type)borderValue[2], (typename VecTraits<T>::elem_type)borderValue[3]};
@@ -1089,7 +1089,7 @@ namespace cv { namespace gpu { namespace imgproc
static const caller_t callers[] =
{
pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>
pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
};
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
@@ -1219,7 +1219,7 @@ namespace cv { namespace gpu { namespace imgproc
static const caller_t callers[] =
{
pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>
pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
};
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);


@@ -70,7 +70,9 @@ namespace cv
{
BORDER_REFLECT101_GPU = 0,
BORDER_REPLICATE_GPU,
BORDER_CONSTANT_GPU
BORDER_CONSTANT_GPU,
BORDER_REFLECT_GPU,
BORDER_WRAP_GPU
};
// Converts CPU border extrapolation mode into GPU internal analogue.

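
tryConvertToGpuBorderType (rewritten as a switch later in this commit) maps the public cv::BORDER_* constants onto this compact internal enum, whose values double as row indices into the widened caller tables. For example:

    int gpuBorderType;
    CV_Assert(cv::gpu::tryConvertToGpuBorderType(cv::BORDER_WRAP, gpuBorderType));
    // gpuBorderType is now cv::gpu::BORDER_WRAP_GPU (== 4), selecting the
    // BrdRowWrap/BrdColWrap/BrdWrap entries added by this commit.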

@@ -718,7 +718,7 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
nppFilter1D_callers[CV_MAT_CN(srcType)]));
}
CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT);
CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
@@ -833,7 +833,7 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
nppFilter1D_callers[CV_MAT_CN(bufType)]));
}
CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT);
CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));


@@ -133,7 +133,7 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR);
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT);
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
@@ -1228,24 +1228,26 @@ namespace
bool cv::gpu::tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType)
{
if (cpuBorderType == cv::BORDER_REFLECT101)
{
gpuBorderType = cv::gpu::BORDER_REFLECT101_GPU;
return true;
}
if (cpuBorderType == cv::BORDER_REPLICATE)
{
gpuBorderType = cv::gpu::BORDER_REPLICATE_GPU;
return true;
}
if (cpuBorderType == cv::BORDER_CONSTANT)
{
gpuBorderType = cv::gpu::BORDER_CONSTANT_GPU;
return true;
}
return false;
switch (cpuBorderType)
{
case cv::BORDER_REFLECT101:
gpuBorderType = cv::gpu::BORDER_REFLECT101_GPU;
return true;
case cv::BORDER_REPLICATE:
gpuBorderType = cv::gpu::BORDER_REPLICATE_GPU;
return true;
case cv::BORDER_CONSTANT:
gpuBorderType = cv::gpu::BORDER_CONSTANT_GPU;
return true;
case cv::BORDER_REFLECT:
gpuBorderType = cv::gpu::BORDER_REFLECT_GPU;
return true;
case cv::BORDER_WRAP:
gpuBorderType = cv::gpu::BORDER_WRAP_GPU;
return true;
default:
return false;
};
}
@@ -1647,7 +1649,7 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, int borderType, Stream& st
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT);
CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
@@ -1683,7 +1685,7 @@ void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, int borderType, Stream& stre
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT);
CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));


@@ -246,55 +246,31 @@ namespace cv { namespace gpu { namespace device
{
return ::max(y, 0);
}
__device__ __forceinline__ float idx_row_low(float y) const
{
return ::fmax(y, 0.0f);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return ::min(y, last_row);
}
__device__ __forceinline__ float idx_row_high(float y) const
{
return ::fmin(y, last_row);
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_low(idx_row_high(y));
}
__device__ __forceinline__ float idx_row(float y) const
{
return idx_row_low(idx_row_high(y));
}
__device__ __forceinline__ int idx_col_low(int x) const
{
return ::max(x, 0);
}
__device__ __forceinline__ float idx_col_low(float x) const
{
return ::fmax(x, 0);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return ::min(x, last_col);
}
__device__ __forceinline__ float idx_col_high(float x) const
{
return ::fmin(x, last_col);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_low(idx_col_high(x));
}
__device__ __forceinline__ float idx_col(float x) const
{
return idx_col_low(idx_col_high(x));
}
template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
{
@@ -421,55 +397,31 @@ namespace cv { namespace gpu { namespace device
{
return ::abs(y);
}
__device__ __forceinline__ float idx_row_low(float y) const
{
return ::fabs(y);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return last_row - ::abs(last_row - y);
}
__device__ __forceinline__ float idx_row_high(float y) const
{
return last_row - ::fabs(last_row - y);
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_low(idx_row_high(y));
}
__device__ __forceinline__ float idx_row(float y) const
{
return idx_row_low(idx_row_high(y));
}
__device__ __forceinline__ int idx_col_low(int x) const
{
return ::abs(x);
}
__device__ __forceinline__ float idx_col_low(float x) const
{
return ::fabs(x);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return last_col - ::abs(last_col - x);
}
__device__ __forceinline__ float idx_col_high(float x) const
{
return last_col - ::fabs(last_col - x);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_low(idx_col_high(x));
}
__device__ __forceinline__ float idx_col(float x) const
{
return idx_col_low(idx_col_high(x));
}
template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
{
@@ -485,6 +437,308 @@ namespace cv { namespace gpu { namespace device
const int last_col;
};
//////////////////////////////////////////////////////////////
// BrdReflect
template <typename D> struct BrdRowReflect
{
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdRowReflect(int width) : last_col(width - 1) {}
template <typename U> __host__ __device__ __forceinline__ BrdRowReflect(int width, U) : last_col(width - 1) {}
__device__ __forceinline__ int idx_col_low(int x) const
{
return ::abs(x) - (x < 0);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return last_col - ::abs(last_col - x) + (x > last_col);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_low(idx_col_high(x));
}
template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col_low(x)]);
}
template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col_high(x)]);
}
template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col(x)]);
}
__host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const
{
return -last_col <= mini && maxi <= 2 * last_col;
}
const int last_col;
};
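// Worked check of the BrdRowReflect index math above. Unlike the
// BrdReflect101 family earlier in this header, BORDER_REFLECT repeats the
// edge pixel (fedcba|abcdefgh|hgfedcb); the extra -(x < 0) and
// +(x > last_col) terms provide exactly that duplication. A hypothetical
// host-side helper mirroring idx_col(x) = idx_col_low(idx_col_high(x)),
// shown for width = 5 (last_col = 4):
//   x:        -3 -2 -1 | 0 1 2 3 4 | 5 6 7
//   idx_col:   2  1  0 | 0 1 2 3 4 | 4 3 2
#include <cstdlib>
inline int reflect_idx(int x, int last_col)
{
    int hi = last_col - std::abs(last_col - x) + (x > last_col); // idx_col_high
    return std::abs(hi) - (hi < 0);                              // idx_col_low
}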
template <typename D> struct BrdColReflect
{
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdColReflect(int height) : last_row(height - 1) {}
template <typename U> __host__ __device__ __forceinline__ BrdColReflect(int height, U) : last_row(height - 1) {}
__device__ __forceinline__ int idx_row_low(int y) const
{
return ::abs(y) - (y < 0);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return last_row - ::abs(last_row - y) + (y > last_row);
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_low(idx_row_high(y));
}
template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step));
}
template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step));
}
template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step));
}
__host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const
{
return -last_row <= mini && maxi <= 2 * last_row;
}
const int last_row;
};
template <typename D> struct BrdReflect
{
typedef D result_type;
__host__ __device__ __forceinline__ BrdReflect(int height, int width) :
last_row(height - 1), last_col(width - 1)
{
}
template <typename U>
__host__ __device__ __forceinline__ BrdReflect(int height, int width, U) :
last_row(height - 1), last_col(width - 1)
{
}
__device__ __forceinline__ int idx_row_low(int y) const
{
return ::abs(y) - (y < 0);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return last_row - ::abs(last_row - y) + (y > last_row);
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_low(idx_row_high(y));
}
__device__ __forceinline__ int idx_col_low(int x) const
{
return ::abs(x) - (x < 0);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return last_col - ::abs(last_col - x) + (x > last_col);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_low(idx_col_high(x));
}
template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
{
return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
}
template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
{
return saturate_cast<D>(src(idx_row(y), idx_col(x)));
}
const int last_row;
const int last_col;
};
//////////////////////////////////////////////////////////////
// BrdWrap
template <typename D> struct BrdRowWrap
{
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdRowWrap(int width_) : width(width_) {}
template <typename U> __host__ __device__ __forceinline__ BrdRowWrap(int width_, U) : width(width_) {}
__device__ __forceinline__ int idx_col_low(int x) const
{
return (x >= 0) * x + (x < 0) * (x - ((x - width + 1) / width) * width);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return (x < width) * x + (x >= width) * (x % width);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_high(idx_col_low(x));
}
template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col_low(x)]);
}
template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col_high(x)]);
}
template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col(x)]);
}
__host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const
{
return true;
}
const int width;
};
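// Worked check of the BrdRowWrap index math above: a branch-free positive
// modulo matching BORDER_WRAP (cdefgh|abcdefgh|abcdefg). idx_col_low lifts
// negative x by the right multiple of width (note the truncating integer
// division), and idx_col_high reduces x >= width with a plain remainder.
// A hypothetical host-side helper mirroring
// idx_col(x) = idx_col_high(idx_col_low(x)), shown for width = 5:
//   x:        -6 -5 -4 ... -1 | 0 1 2 3 4 | 5 6
//   idx_col:   4  0  1 ...  4 | 0 1 2 3 4 | 0 1
inline int wrap_idx(int x, int width)
{
    int lo = (x >= 0) * x + (x < 0) * (x - ((x - width + 1) / width) * width);
    return (lo < width) * lo + (lo >= width) * (lo % width);
}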
template <typename D> struct BrdColWrap
{
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdColWrap(int height_) : height(height_) {}
template <typename U> __host__ __device__ __forceinline__ BrdColWrap(int height_, U) : height(height_) {}
__device__ __forceinline__ int idx_row_low(int y) const
{
return (y >= 0) * y + (y < 0) * (y - ((y - height + 1) / height) * height);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return (y < height) * y + (y >= height) * (y % height);
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_high(idx_row_low(y));
}
template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step));
}
template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step));
}
template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step));
}
__host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const
{
return true;
}
const int height;
};
template <typename D> struct BrdWrap
{
typedef D result_type;
__host__ __device__ __forceinline__ BrdWrap(int height_, int width_) :
height(height_), width(width_)
{
}
template <typename U>
__host__ __device__ __forceinline__ BrdWrap(int height_, int width_, U) :
height(height_), width(width_)
{
}
__device__ __forceinline__ int idx_row_low(int y) const
{
return (y >= 0) * y + (y < 0) * (y - ((y - height + 1) / height) * height);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return (y < height) * y + (y >= height) * (y % height);
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_high(idx_row_low(y));
}
__device__ __forceinline__ int idx_col_low(int x) const
{
return (x >= 0) * x + (x < 0) * (x - ((x - width + 1) / width) * width);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return (x < width) * x + (x >= width) * (x % width);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_high(idx_col_low(x));
}
template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
{
return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
}
template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
{
return saturate_cast<D>(src(idx_row(y), idx_col(x)));
}
const int height;
const int width;
};
//////////////////////////////////////////////////////////////
// BorderReader