From 17d9014373598b988499e818b8361fdf9660ee1a Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 6 Dec 2010 08:10:11 +0000
Subject: [PATCH] added per-element min/max to gpu module. fixed compile error
 in transform.

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  20 ++++
 modules/gpu/src/arithm.cpp              | 147 ++++++++++++++++++++++++
 modules/gpu/src/cuda/mathfunc.cu        | 124 ++++++++++++++++++++
 modules/gpu/src/cuda/transform.hpp      |   6 +-
 tests/gpu/src/arithm.cpp                |  25 ++++
 5 files changed, 319 insertions(+), 3 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 6f941f33b..a8d35944b 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -546,6 +546,26 @@ namespace cv
         CV_EXPORTS GpuMat operator & (const GpuMat& src1, const GpuMat& src2);
         CV_EXPORTS GpuMat operator ^ (const GpuMat& src1, const GpuMat& src2);
 
+        //! computes per-element minimum of two arrays (dst = min(src1, src2))
+        CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
+        //! Async version
+        CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);
+
+        //! computes per-element minimum of array and scalar (dst = min(src1, src2))
+        CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst);
+        //! Async version
+        CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);
+
+        //! computes per-element maximum of two arrays (dst = max(src1, src2))
+        CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
+        //! Async version
+        CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);
+
+        //! computes per-element maximum of array and scalar (dst = max(src1, src2))
+        CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst);
+        //! Async version
+        CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);
+
 
         ////////////////////////////// Image processing //////////////////////////////
 
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index e1ac3d2fa..0d9131416 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -98,6 +98,14 @@ cv::gpu::GpuMat cv::gpu::operator ~ (const GpuMat&) { throw_nogpu(); return GpuM
 cv::gpu::GpuMat cv::gpu::operator | (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); }
 cv::gpu::GpuMat cv::gpu::operator & (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); }
 cv::gpu::GpuMat cv::gpu::operator ^ (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); }
+void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
+void cv::gpu::min(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
+void cv::gpu::min(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
+void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
+void cv::gpu::max(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
+void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -1119,5 +1127,144 @@ cv::gpu::GpuMat cv::gpu::operator ^ (const GpuMat& src1, const GpuMat& src2)
     return dst;
 }
 
+//////////////////////////////////////////////////////////////////////////////
+// min/max
+
+namespace cv { namespace gpu { namespace mathfunc
+{
+    template <typename T>
+    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
+
+    template <typename T>
+    void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
+
+    template <typename T>
+    void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
+
+    template <typename T>
+    void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
+}}}
+
+namespace
+{
+    template <typename T>
+    void min_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
+    {
+        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+        dst.create(src1.size(), src1.type());
+        mathfunc::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
+    }
+
+    template <typename T>
+    void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
+    {
+        dst.create(src1.size(), src1.type());
+        mathfunc::min_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
+    }
+
+    template <typename T>
+    void max_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
+    {
+        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+        dst.create(src1.size(), src1.type());
+        mathfunc::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
+    }
+
+    template <typename T>
+    void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
+    {
+        dst.create(src1.size(), src1.type());
+        mathfunc::max_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
+    }
+}
+
+void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
+{
+    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
+        min_caller<float>, min_caller<double>
+    };
+    funcs[src1.depth()](src1, src2, dst, 0);
+}
+
+void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
+{
+    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
+        min_caller<float>, min_caller<double>
+    };
+    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
+}
+
+void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)
+{
+    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
+        min_caller<float>, min_caller<double>
+    };
+    funcs[src1.depth()](src1, src2, dst, 0);
+}
+
+void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
+{
+    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
+        min_caller<float>, min_caller<double>
+    };
+    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
+}
+
+void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
+{
+    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
+        max_caller<float>, max_caller<double>
+    };
+    funcs[src1.depth()](src1, src2, dst, 0);
+}
+
+void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
+{
+    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
+        max_caller<float>, max_caller<double>
+    };
+    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
+}
+
+void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)
+{
+    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
+        max_caller<float>, max_caller<double>
+    };
+    funcs[src1.depth()](src1, src2, dst, 0);
+}
+
+void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
+{
+    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
+        max_caller<float>, max_caller<double>
+    };
+    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
+}
+
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index 1bf811a59..c088058ba 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -43,6 +43,7 @@
 #include "cuda_shared.hpp"
 #include "transform.hpp"
 #include "limits_gpu.hpp"
+#include "saturate_cast.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -1295,4 +1296,127 @@ namespace cv { namespace gpu { namespace mathfunc
     template void transpose_gpu(const DevMem2D& src, const DevMem2D& dst);
     template void transpose_gpu(const DevMem2D& src, const DevMem2D& dst);
     template void transpose_gpu(const DevMem2D& src, const DevMem2D& dst);
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+// min/max
+
+    struct MinOp
+    {
+        template <typename T>
+        __device__ T operator()(T a, T b)
+        {
+            return min(a, b);
+        }
+        __device__ float operator()(float a, float b)
+        {
+            return fmin(a, b);
+        }
+        __device__ double operator()(double a, double b)
+        {
+            return fmin(a, b);
+        }
+    };
+
+    struct MaxOp
+    {
+        template <typename T>
+        __device__ T operator()(T a, T b)
+        {
+            return max(a, b);
+        }
+        __device__ float operator()(float a, float b)
+        {
+            return fmax(a, b);
+        }
+        __device__ double operator()(double a, double b)
+        {
+            return fmax(a, b);
+        }
+    };
+
+    struct ScalarMinOp
+    {
+        double s;
+
+        explicit ScalarMinOp(double s_) : s(s_) {}
+
+        template <typename T>
+        __device__ T operator()(T a)
+        {
+            return saturate_cast<T>(fmin((double)a, s));
+        }
+    };
+
+    struct ScalarMaxOp
+    {
+        double s;
+
+        explicit ScalarMaxOp(double s_) : s(s_) {}
+
+        template <typename T>
+        __device__ T operator()(T a)
+        {
+            return saturate_cast<T>(fmax((double)a, s));
+        }
+    };
+
+    template <typename T>
+    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
+    {
+        MinOp op;
+        transform(src1, src2, dst, op, stream);
+    }
+
+    template void min_gpu<uchar>(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
+    template void min_gpu<char>(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
+    template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
+    template void min_gpu<short>(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
+    template void min_gpu<int>(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
+    template void min_gpu<float>(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
+    template void min_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
+
+    template <typename T>
+    void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
+    {
+        MaxOp op;
+        transform(src1, src2, dst, op, stream);
+    }
+
+    template void max_gpu<uchar>(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
+    template void max_gpu<char>(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
+    template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
+    template void max_gpu<short>(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
+    template void max_gpu<int>(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
+    template void max_gpu<float>(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
+    template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
+
+    template <typename T>
+    void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
+    {
+        ScalarMinOp op(src2);
+        transform(src1, dst, op, stream);
+    }
+
+    template void min_gpu<uchar>(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
+    template void min_gpu<char>(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
+    template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
+    template void min_gpu<short>(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
+    template void min_gpu<int>(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
+    template void min_gpu<float>(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
+    template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
+
+    template <typename T>
+    void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
+    {
+        ScalarMaxOp op(src2);
+        transform(src1, dst, op, stream);
+    }
+
+    template void max_gpu<uchar>(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
+    template void max_gpu<char>(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
+    template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
+    template void max_gpu<short>(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
+    template void max_gpu<int>(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
+    template void max_gpu<float>(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
+    template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
 }}}
diff --git a/modules/gpu/src/cuda/transform.hpp b/modules/gpu/src/cuda/transform.hpp
index af516b3d9..c50d8d151 100644
--- a/modules/gpu/src/cuda/transform.hpp
+++ b/modules/gpu/src/cuda/transform.hpp
@@ -64,7 +64,7 @@ namespace cv { namespace gpu { namespace device
 
     //! Transform kernels
 
-    template
+    template
     static __global__ void transform(const DevMem2D_ src, PtrStep_ dst, const Mask mask, UnOp op)
     {
         const int x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -77,7 +77,7 @@
         }
     }
 
-    template
+    template
     static __global__ void transform(const DevMem2D_ src1, const PtrStep_ src2, PtrStep_ dst, const Mask mask, BinOp op)
     {
         const int x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -105,7 +105,7 @@ namespace cv
             grid.x = divUp(src.cols, threads.x);
             grid.y = divUp(src.rows, threads.y);
 
-            device::transform<<>>(src, dst, device::NoMask(), op);
+            device::transform<<>>(src, dst, device::NoMask(), op);
 
             if (stream == 0)
                 cudaSafeCall( cudaThreadSynchronize() );
diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp
index 29943c781..5b7d5d600 100644
--- a/tests/gpu/src/arithm.cpp
+++ b/tests/gpu/src/arithm.cpp
@@ -949,6 +949,30 @@ struct CV_GpuCountNonZeroTest: CvTest
     }
 };
 
+////////////////////////////////////////////////////////////////////////////////
+// min/max
+
+struct CV_GpuImageMinMaxTest : public CV_GpuArithmTest
+{
+    CV_GpuImageMinMaxTest() : CV_GpuArithmTest( "GPU-ImageMinMax", "min/max" ) {}
+
+    int test( const Mat& mat1, const Mat& mat2 )
+    {
+        cv::Mat cpuMinRes, cpuMaxRes;
+        cv::min(mat1, mat2, cpuMinRes);
+        cv::max(mat1, mat2, cpuMaxRes);
+
+        GpuMat gpu1(mat1);
+        GpuMat gpu2(mat2);
+        GpuMat gpuMinRes, gpuMaxRes;
+        cv::gpu::min(gpu1, gpu2, gpuMinRes);
+        cv::gpu::max(gpu1, gpu2, gpuMaxRes);
+
+        return CheckNorm(cpuMinRes, gpuMinRes) == CvTS::OK && CheckNorm(cpuMaxRes, gpuMaxRes) == CvTS::OK ?
+            CvTS::OK : CvTS::FAIL_GENERIC;
+    }
+};
+
 /////////////////////////////////////////////////////////////////////////////
 /////////////////// tests registration  /////////////////////////////////////
 
@@ -979,3 +1003,4 @@ CV_GpuNppImagePolarToCartTest CV_GpuNppImagePolarToCart_test;
 CV_GpuMinMaxTest CV_GpuMinMaxTest_test;
 CV_GpuMinMaxLocTest CV_GpuMinMaxLocTest_test;
 CV_GpuCountNonZeroTest CV_CountNonZero_test;
+CV_GpuImageMinMaxTest CV_GpuImageMinMax_test;
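
Note (editor): a minimal host-side usage sketch of the API this patch adds, not part of the patch itself. It assumes an OpenCV build with HAVE_CUDA and a CUDA device; the image sizes, types, and scalar values are arbitrary examples, and the blocking/async split mirrors the overloads declared in gpu.hpp above.

// usage_min_max_sketch.cpp -- illustration only, not from the patch
#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    // Two single-channel 8-bit images filled with constant values (example data).
    cv::Mat a(480, 640, CV_8UC1, cv::Scalar(10));
    cv::Mat b(480, 640, CV_8UC1, cv::Scalar(200));

    // Upload to the GPU via the GpuMat(Mat) constructor.
    cv::gpu::GpuMat ga(a), gb(b), gmin, gmax, gclamped;

    // Blocking overloads: per-element minimum/maximum of two arrays.
    cv::gpu::min(ga, gb, gmin);
    cv::gpu::max(ga, gb, gmax);

    // Array-vs-scalar overload, e.g. clamp every pixel to at most 128.
    cv::gpu::min(ga, 128.0, gclamped);

    // Async overloads enqueue the kernel on a Stream and return immediately.
    cv::gpu::Stream stream;
    cv::gpu::min(ga, gb, gmin, stream);
    cv::gpu::max(ga, 32.0, gmax, stream);
    stream.waitForCompletion();

    // Copy a result back to the host.
    cv::Mat result;
    gmin.download(result);
    return 0;
}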