From f2c30cd90d46caf28d1933ff741a06cc7a252d5e Mon Sep 17 00:00:00 2001
From: Marina Kolpakova <no@email>
Date: Sat, 9 Jun 2012 15:24:01 +0000
Subject: [PATCH] resize area for big downscaling integration

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  6 ++-
 modules/gpu/perf/perf_imgproc.cpp       | 34 +++++++++++++
 modules/gpu/perf_cpu/perf_imgproc.cpp   | 31 ++++++++++++
 modules/gpu/src/cuda/resize.cu          | 48 +++++++++++++++++-
 modules/gpu/src/resize.cpp              | 67 ++++++++++++++++++++++++-
 5 files changed, 183 insertions(+), 3 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 2d1902245..0e01d9cfc 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -626,9 +626,13 @@ CV_EXPORTS void swapChannels(GpuMat& image, const int dstOrder[4], Stream& strea
 CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
 
 //! resizes the image
-//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
+//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA
 CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
 
+//! resizes the image
+//! Supports INTER_AREA
+CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, GpuMat& buffer, double fx=0, double fy=0, int interpolation = INTER_AREA, Stream& stream = Stream::Null());
+
 //! warps the image using affine transformation
 //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
 CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR,
diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp
index b261ad086..78ca5bf0e 100644
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@@ -90,6 +90,40 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Resize, testing::Combine(
                     Interpolation(cv::INTER_CUBIC),   Interpolation(cv::INTER_AREA)),
     testing::Values(Scale(0.5), Scale(0.3), Scale(2.0))));
 
+GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale)
+{
+    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
+    cv::gpu::setDevice(devInfo.deviceID());
+
+    cv::Size size = GET_PARAM(1);
+    int type = GET_PARAM(2);
+    int interpolation = cv::INTER_AREA;
+    double f = GET_PARAM(3);
+
+    cv::Mat src_host(size, type);
+    fill(src_host, 0, 255);
+
+    cv::gpu::GpuMat src(src_host);
+    cv::gpu::GpuMat dst;
+
+    cv::gpu::resize(src, dst, cv::Size(), f, f, interpolation);
+
+    declare.time(1.0);
+
+    TEST_CYCLE()
+    {
+        cv::gpu::resize(src, dst, cv::Size(), f, f, interpolation);
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(perf::sz1080p, cv::Size(4096, 2048)),
+    testing::Values(MatType(CV_8UC1)/*, MatType(CV_8UC3), MatType(CV_8UC4),
+                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
+                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
+    testing::Values(Scale(0.2),Scale(0.1),Scale(0.05))));
+
 //////////////////////////////////////////////////////////////////////
 // WarpAffine
 
diff --git a/modules/gpu/perf_cpu/perf_imgproc.cpp b/modules/gpu/perf_cpu/perf_imgproc.cpp
index 9fe9fd07a..9a1adde81 100644
--- a/modules/gpu/perf_cpu/perf_imgproc.cpp
+++ b/modules/gpu/perf_cpu/perf_imgproc.cpp
@@ -80,6 +80,37 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Resize, testing::Combine(
                     Interpolation(cv::INTER_CUBIC),   Interpolation(cv::INTER_AREA)),
     testing::Values(Scale(0.5), Scale(0.3), Scale(2.0))));
 
+GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale)
+{
+    cv::Size size = GET_PARAM(1);
+    int type = GET_PARAM(2);
+    int interpolation = cv::INTER_AREA;
+    double f = GET_PARAM(3);
+
+    cv::Mat src_host(size, type);
+    fill(src_host, 0, 255);
+
+    cv::Mat src(src_host);
+    cv::Mat dst;
+
+    cv::resize(src, dst, cv::Size(), f, f, interpolation);
+
+    declare.time(1.0);
+
+    TEST_CYCLE()
+    {
+        cv::resize(src, dst, cv::Size(), f, f, interpolation);
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(perf::sz1080p, cv::Size(4096, 2048)),
+    testing::Values(MatType(CV_8UC1)/*,  MatType(CV_8UC3), MatType(CV_8UC4),
+                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
+                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
+    testing::Values(Scale(0.2),Scale(0.1),Scale(0.05))));
+
 //////////////////////////////////////////////////////////////////////
 // WarpAffine
 
diff --git a/modules/gpu/src/cuda/resize.cu b/modules/gpu/src/cuda/resize.cu
index a51c04306..a0686d0d5 100644
--- a/modules/gpu/src/cuda/resize.cu
+++ b/modules/gpu/src/cuda/resize.cu
@@ -116,7 +116,6 @@ namespace cv { namespace gpu { namespace device
             {
                 dim3 block(32, 8);
                 dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
                 BrdConstant<T> brd(src.rows, src.cols);
                 BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
                 IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
@@ -278,5 +277,52 @@ namespace cv { namespace gpu { namespace device
         //template void resize_gpu<float2>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
         template void resize_gpu<float3>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
         template void resize_gpu<float4>(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy, DevMem2Db dst, int interpolation, cudaStream_t stream);
+
+        template<typename T> struct scan_traits{};
+
+        template<> struct scan_traits<uchar>
+        {
+            typedef int scan_line_type;
+        };
+
+        template <typename Ptr2D, typename T>
+        __global__ void resize_area_scan(const Ptr2D src, int fx, int fy, DevMem2D_<T> dst, DevMem2D_<T> buffer)
+        {
+            typedef typename scan_traits<T>::scan_line_type W;
+            extern __shared__ W line[];
+
+            const int x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+        }
+
+        template <typename T> struct InterAreaDispatcherStream
+        {
+            static void call(DevMem2D_<T> src, int fx, int fy, DevMem2D_<T> dst, DevMem2D_<T> buffer, cudaStream_t stream)
+            {
+                dim3 block(256, 1);
+                dim3 grid(divUp(dst.cols, block.x), 1);
+
+                resize_area_scan<<<grid, block, 256 * 2 * sizeof(typename scan_traits<T>::scan_line_type) >>>(src, fx, fy, dst, buffer);
+                cudaSafeCall( cudaGetLastError() );
+
+                if (stream == 0)
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
+
+        template <typename T>
+        void resize_area_gpu(DevMem2Db src, DevMem2Db dst,float fx, float fy,
+                             int interpolation, DevMem2Db buffer, cudaStream_t stream)
+        {
+            (void)interpolation;
+
+            int iscale_x = round(fx);
+            int iscale_y = round(fy);
+
+            InterAreaDispatcherStream<T>::call(src, iscale_x, iscale_y, dst, buffer, stream);
+        }
+
+        template void resize_area_gpu<uchar>(DevMem2Db src, DevMem2Db dst, float fx, float fy, int interpolation, DevMem2Db buffer, cudaStream_t stream);
+
     } // namespace imgproc
 }}} // namespace cv { namespace gpu { namespace device
diff --git a/modules/gpu/src/resize.cpp b/modules/gpu/src/resize.cpp
index 672dbc288..4f8ea0517 100644
--- a/modules/gpu/src/resize.cpp
+++ b/modules/gpu/src/resize.cpp
@@ -44,7 +44,32 @@
 
 #ifndef HAVE_CUDA
 
-void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&) { throw_nogpu(); }
+void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
+{
+    (void)src;
+    (void)dst;
+    (void)dsize;
+    (void)fx;
+    (void)fy;
+    (void)interpolation;
+    (void)s;
+
+    throw_nogpu();
+}
+void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy,
+                     int interpolation, const GpuMat& buffer, Stream& s)
+{
+    (void)src;
+    (void)dst;
+    (void)dsize;
+    (void)fx;
+    (void)fy;
+    (void)interpolation;
+    (void)buffer;
+    (void)s;
+
+    throw_nogpu();
+}
 
 #else // HAVE_CUDA
 
@@ -55,9 +80,49 @@ namespace cv { namespace gpu { namespace device
         template <typename T>
         void resize_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy,
                         DevMem2Db dst, int interpolation, cudaStream_t stream);
+
+        template <typename T>
+        void resize_area_gpu(DevMem2Db src, DevMem2Db dst,float fx, float fy,
+                             int interpolation, DevMem2Db buffer, cudaStream_t stream);
     }
 }}}
 
+void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, GpuMat& buffer, double fx, double fy,
+                     int interpolation, Stream& s)
+{
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+    CV_Assert(interpolation == INTER_AREA);
+    CV_Assert( (fx < 1.0) && (fy < 1.0));
+    CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));
+
+    if (dsize == Size())
+        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
+    else
+    {
+        fx = static_cast<double>(dsize.width) / src.cols;
+        fy = static_cast<double>(dsize.height) / src.rows;
+    }
+
+    fx = static_cast<float>(1.0 / fx);
+    fy = static_cast<float>(1.0 / fy);
+
+    dst.create(dsize, src.type());
+    buffer.create(cv::Size(dsize.width, src.rows), src.type());
+
+    if (dsize == src.size())
+    {
+        if (s)
+            s.enqueueCopy(src, dst);
+        else
+            src.copyTo(dst);
+        return;
+    }
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    cv::gpu::device::imgproc::resize_area_gpu<uchar>(src, dst, fx, fy, interpolation, buffer, stream);
+}
+
 void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
 {
     CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);