added custom implementation for nearest and linear interpolation

2013-08-22 12:31:10 +04:00 · 2013-08-22 12:31:10 +04:00 · 241cc417f9
commit 241cc417f9
parent 3b05acf936
1 changed files with 161 additions and 9 deletions
--- a/modules/gpu/src/cuda/resize.cu
+++ b/modules/gpu/src/cuda/resize.cu
@ -54,17 +54,68 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
+        // Nearest-neighbour resize kernel: each thread produces one destination pixel.
+        //   src    - source image accessor (no bounds check is performed on the source read here)
+        //   fx, fy - factors that scale a destination column/row into source coordinates
+        //   dst    - destination image; its cols/rows bound the set of active threads
+        // The scaled coordinates are converted to integers with __float2int_rz
+        // (round toward zero) and the source pixel at that position is copied as is.
+        template <typename T> __global__ void resize_nearest(const PtrStep<T> src, const float fx, const float fy, PtrStepSz<T> dst)
+        {
+            const int col = blockDim.x * blockIdx.x + threadIdx.x;
+            const int row = blockDim.y * blockIdx.y + threadIdx.y;
+
+            // Threads that fall outside the destination image have nothing to write.
+            if (col >= dst.cols || row >= dst.rows)
+                return;
+
+            const float src_col = col * fx;
+            const float src_row = row * fy;
+
+            dst(row, col) = src(__float2int_rz(src_row), __float2int_rz(src_col));
+        }
+
+        // Bilinear resize kernel: each thread produces one destination pixel.
+        //   src    - source image (cols/rows are used to clamp the far neighbours)
+        //   fx, fy - factors that scale a destination column/row into source coordinates
+        //   dst    - destination image; its cols/rows bound the set of active threads
+        // The four source pixels around the scaled coordinate are blended in a float
+        // work type with the same channel count as T, then converted back to T with
+        // saturate_cast.
+        template <typename T> __global__ void resize_linear(const PtrStepSz<T> src, const float fx, const float fy, PtrStepSz<T> dst)
+        {
+            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
+
+            const int col = blockDim.x * blockIdx.x + threadIdx.x;
+            const int row = blockDim.y * blockIdx.y + threadIdx.y;
+
+            // Threads that fall outside the destination image have nothing to write.
+            if (col >= dst.cols || row >= dst.rows)
+                return;
+
+            const float src_x = col * fx;
+            const float src_y = row * fy;
+
+            // Integer cell containing the sample point (__float2int_rd rounds down).
+            const int left = __float2int_rd(src_x);
+            const int top = __float2int_rd(src_y);
+            const int right = left + 1;
+            const int bottom = top + 1;
+
+            // Only the read positions of the far neighbours are clamped to the last
+            // column/row; the weights below still use the unclamped right/bottom.
+            const int right_read = ::min(right, src.cols - 1);
+            const int bottom_read = ::min(bottom, src.rows - 1);
+
+            // Per-axis weights: each neighbour is weighted by the distance to the opposite side.
+            const float w_left = right - src_x;
+            const float w_right = src_x - left;
+            const float w_top = bottom - src_y;
+            const float w_bottom = src_y - top;
+
+            work_type acc = VecTraits<work_type>::all(0);
+
+            T pix = src(top, left);
+            acc = acc + pix * (w_left * w_top);
+
+            pix = src(top, right_read);
+            acc = acc + pix * (w_right * w_top);
+
+            pix = src(bottom_read, left);
+            acc = acc + pix * (w_left * w_bottom);
+
+            pix = src(bottom_read, right_read);
+            acc = acc + pix * (w_right * w_bottom);
+
+            dst(row, col) = saturate_cast<T>(acc);
+        }
+
        template <class Ptr2D, typename T> __global__ void resize(const Ptr2D src, const float fx, const float fy, PtrStepSz<T> dst)
        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+            const int dst_x = blockDim.x * blockIdx.x + threadIdx.x; // destination column handled by this thread
+            const int dst_y = blockDim.y * blockIdx.y + threadIdx.y; // destination row handled by this thread

-            if (x < dst.cols && y < dst.rows)
+            if (dst_x < dst.cols && dst_y < dst.rows) // threads outside the destination image do nothing
            {
-                const float xcoo = x * fx;
-                const float ycoo = y * fy;
+                const float src_x = dst_x * fx; // destination column scaled by fx
+                const float src_y = dst_y * fy; // destination row scaled by fy

-                dst(y, x) = saturate_cast<T>(src(ycoo, xcoo));
+                dst(dst_y, dst_x) = src(src_y, src_x); // src is sampled at float (y, x) coordinates; the result is stored without saturate_cast
            }
        }

@ -77,12 +128,34 @@ namespace cv { namespace gpu { namespace device

                BrdReplicate<T> brd(src.rows, src.cols);
                BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc, fx, fy);
+                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);

                resize<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
                cudaSafeCall( cudaGetLastError() );
            }
        };
+        // ResizeDispatcherStream specialization for PointFilter: launches the
+        // resize_nearest kernel on the given stream with 32x8 thread blocks and a
+        // grid sized from the destination dimensions via divUp. Only the launch
+        // status is checked here; no synchronization is performed.
+        template <typename T> struct ResizeDispatcherStream<PointFilter, T>
+        {
+            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
+            {
+                const dim3 threads(32, 8);
+                const dim3 blocks(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
+
+                resize_nearest<<<blocks, threads, 0, stream>>>(src, fx, fy, dst);
+
+                // Report an error raised by the launch itself.
+                cudaSafeCall( cudaGetLastError() );
+            }
+        };
+        // ResizeDispatcherStream specialization for LinearFilter: launches the
+        // resize_linear kernel on the given stream with 32x8 thread blocks and a
+        // grid sized from the destination dimensions via divUp. Only the launch
+        // status is checked here; no synchronization is performed.
+        template <typename T> struct ResizeDispatcherStream<LinearFilter, T>
+        {
+            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
+            {
+                const dim3 threads(32, 8);
+                const dim3 blocks(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
+
+                resize_linear<<<blocks, threads, 0, stream>>>(src, fx, fy, dst);
+
+                // Report an error raised by the launch itself.
+                cudaSafeCall( cudaGetLastError() );
+            }
+        };

        template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
        {
@ -101,6 +174,32 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
            }
        };
+        // ResizeDispatcherNonStream specialization for PointFilter: launches the
+        // resize_nearest kernel without an explicit stream and then waits for the
+        // device to finish. The second, third and fourth parameters are unnamed
+        // and unused by this specialization.
+        template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T>, int, int, float fx, float fy, PtrStepSz<T> dst)
+            {
+                const dim3 threads(32, 8);
+                const dim3 blocks(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
+
+                resize_nearest<<<blocks, threads>>>(src, fx, fy, dst);
+
+                // Check the launch first, then block until the device has finished.
+                cudaSafeCall( cudaGetLastError() );
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
+        // ResizeDispatcherNonStream specialization for LinearFilter: launches the
+        // resize_linear kernel without an explicit stream and then waits for the
+        // device to finish. The second, third and fourth parameters are unnamed
+        // and unused by this specialization.
+        template <typename T> struct ResizeDispatcherNonStream<LinearFilter, T>
+        {
+            static void call(PtrStepSz<T> src, PtrStepSz<T>, int, int, float fx, float fy, PtrStepSz<T> dst)
+            {
+                const dim3 threads(32, 8);
+                const dim3 blocks(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
+
+                resize_linear<<<blocks, threads>>>(src, fx, fy, dst);
+
+                // Check the launch first, then block until the device has finished.
+                cudaSafeCall( cudaGetLastError() );
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };

        #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
            texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
@ -140,6 +239,58 @@ namespace cv { namespace gpu { namespace device
                    cudaSafeCall( cudaGetLastError() ); \
                    cudaSafeCall( cudaDeviceSynchronize() ); \
                } \
+            }; \
+            template <> struct ResizeDispatcherNonStream<PointFilter, type > \
+            { \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz< type > dst) \
+                { \
+                    const dim3 block(32, 8); \
+                    const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_resize_ ## type, srcWhole); \
+                    tex_resize_ ## type ## _reader texSrc; \
+                    texSrc.xoff = xoff; \
+                    texSrc.yoff = yoff; \
+                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
+                    { \
+                        PointFilter<tex_resize_ ## type ## _reader> filteredSrc(texSrc); \
+                        resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
+                    } \
+                    else \
+                    { \
+                        BrdReplicate< type > brd(src.rows, src.cols); \
+                        BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > brdSrc(texSrc, brd); \
+                        PointFilter< BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > > filteredSrc(brdSrc); \
+                        resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
+                    } \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+                } \
+            }; \
+            template <> struct ResizeDispatcherNonStream<LinearFilter, type > \
+            { \
+                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz< type > dst) \
+                { \
+                    const dim3 block(32, 8); \
+                    const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_resize_ ## type, srcWhole); \
+                    tex_resize_ ## type ## _reader texSrc; \
+                    texSrc.xoff = xoff; \
+                    texSrc.yoff = yoff; \
+                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
+                    { \
+                        LinearFilter<tex_resize_ ## type ## _reader> filteredSrc(texSrc); \
+                        resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
+                    } \
+                    else \
+                    { \
+                        BrdReplicate< type > brd(src.rows, src.cols); \
+                        BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > brdSrc(texSrc, brd); \
+                        LinearFilter< BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > > filteredSrc(brdSrc); \
+                        resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
+                    } \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+                } \
            };

        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
@ -180,7 +331,7 @@ namespace cv { namespace gpu { namespace device

            if (x < dst.cols && y < dst.rows)
            {
-                dst(y, x) = saturate_cast<T>(src(y, x));
+                dst(y, x) = src(y, x);
            }
        }

@ -227,7 +378,8 @@ namespace cv { namespace gpu { namespace device
                ResizeDispatcher<CubicFilter, T>::call,
                ResizeAreaDispatcher<T>::call
            };
-            // chenge to linear if area interpolation upscaling
+
+            // fall back to linear interpolation when area interpolation is requested for upscaling
            if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
                interpolation = 1;