Added tests for gpu::sum; it now supports all data types, but single-channel images only.
commit 3997514b7c (parent 442cd75c32)
@@ -421,9 +421,12 @@ namespace cv
     CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode);
 
     //! computes sum of array elements
-    //! supports CV_8UC1, CV_8UC4 types
-    //! disabled until fix crash
-    CV_EXPORTS Scalar sum(const GpuMat& m);
+    //! supports only single channel images
+    CV_EXPORTS Scalar sum(const GpuMat& src);
+
+    //! computes sum of array elements
+    //! supports only single channel images
+    CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
 
     //! finds global minimum and maximum array elements and returns their values
     CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
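For context, a minimal usage sketch of the two overloads declared above (illustrative only, not part of this commit; assumes an OpenCV 2.x build of this era with the gpu module enabled):

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::Mat host = cv::Mat::ones(480, 640, CV_32FC1);   // single-channel input
        cv::gpu::GpuMat dev(host);                          // upload to the GPU

        // One-shot call: an internal work buffer is allocated on every invocation.
        cv::Scalar s1 = cv::gpu::sum(dev);

        // Buffered call: reuse the same work buffer across repeated sums,
        // avoiding a device allocation per call in a processing loop.
        cv::gpu::GpuMat buf;
        cv::Scalar s2 = cv::gpu::sum(dev, buf);

        return (s1[0] == s2[0]) ? 0 : 1;
    }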
@@ -65,6 +65,7 @@ double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; }
 double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; }
 void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); }
 Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
+Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
 void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); }
@@ -480,36 +481,50 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode)
 ////////////////////////////////////////////////////////////////////////
 // sum
 
-Scalar cv::gpu::sum(const GpuMat& src)
-{
-    CV_Assert(!"disabled until fix crash");
-
-    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4);
-
-    NppiSize sz;
-    sz.width = src.cols;
-    sz.height = src.rows;
-
-    Scalar res;
-
-    int bufsz;
-
-    if (src.type() == CV_8UC1)
-    {
-        nppiReductionGetBufferHostSize_8u_C1R(sz, &bufsz);
-        GpuMat buf(1, bufsz, CV_32S);
-
-        nppSafeCall( nppiSum_8u_C1R(src.ptr<Npp8u>(), src.step, sz, buf.ptr<Npp32s>(), res.val) );
-    }
-    else
-    {
-        nppiReductionGetBufferHostSize_8u_C4R(sz, &bufsz);
-        GpuMat buf(1, bufsz, CV_32S);
-
-        nppSafeCall( nppiSum_8u_C4R(src.ptr<Npp8u>(), src.step, sz, buf.ptr<Npp32s>(), res.val) );
-    }
-
-    return res;
-}
+namespace cv { namespace gpu { namespace mathfunc
+{
+    template <typename T>
+    void sum_caller(const DevMem2D src, PtrStep buf, double* sum);
+
+    template <typename T>
+    void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum);
+
+    namespace sum
+    {
+        void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
+    }
+}}}
+
+Scalar cv::gpu::sum(const GpuMat& src)
+{
+    GpuMat buf;
+    return sum(src, buf);
+}
+
+Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
+{
+    using namespace mathfunc;
+    CV_Assert(src.channels() == 1);
+
+    typedef void (*Caller)(const DevMem2D, PtrStep, double*);
+    static const Caller callers[2][7] =
+        { { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>,
+            sum_multipass_caller<unsigned short>, sum_multipass_caller<short>,
+            sum_multipass_caller<int>, sum_multipass_caller<float>, 0 },
+          { sum_caller<unsigned char>, sum_caller<char>,
+            sum_caller<unsigned short>, sum_caller<short>,
+            sum_caller<int>, sum_caller<float>, sum_caller<double> } };
+
+    Size bufSize;
+    sum::get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height);
+    buf.create(bufSize, CV_8U);
+
+    Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
+    if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");
+
+    double result;
+    caller(src, buf, &result);
+    return result;
+}
 
 ////////////////////////////////////////////////////////////////////////
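The new host code selects the reduction at run time from a 2 x 7 function-pointer table: row 0 holds the multi-pass callers for devices without global atomics, row 1 the single-pass callers, and the table is indexed by hasAtomicsSupport(getDevice()) and the source depth (the double slot is 0 in the multi-pass row, presumably because native double support implies a device that also has atomics). A standalone sketch of that table-dispatch pattern, collapsed to one element type and using hypothetical names rather than OpenCV API:

    #include <cstdio>

    typedef void (*Kernel)(const float*, int, double*);

    // Two interchangeable implementations of the same operation.
    static void sum_multipass(const float* data, int n, double* out)
    { double s = 0; for (int i = 0; i < n; ++i) s += data[i]; *out = s; }

    static void sum_singlepass(const float* data, int n, double* out)
    { sum_multipass(data, n, out); /* stands in for the atomics-based path */ }

    int main()
    {
        // Mirrors callers[hasAtomics][type] from the commit, reduced to one type.
        static const Kernel callers[2] = { sum_multipass, sum_singlepass };

        const bool hasAtomics = true;            // would come from a device query
        const float data[4] = { 1.f, 2.f, 3.f, 4.f };

        double result = 0;
        callers[hasAtomics](data, 4, &result);   // run-time selection, no branching at call sites
        std::printf("%f\n", result);
        return 0;
    }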
@@ -1419,6 +1419,15 @@ namespace cv { namespace gpu { namespace mathfunc
     namespace sum
     {
 
+        template <typename T> struct SumType {};
+        template <> struct SumType<unsigned char> { typedef unsigned int R; };
+        template <> struct SumType<char> { typedef int R; };
+        template <> struct SumType<unsigned short> { typedef unsigned int R; };
+        template <> struct SumType<short> { typedef int R; };
+        template <> struct SumType<int> { typedef int R; };
+        template <> struct SumType<float> { typedef float R; };
+        template <> struct SumType<double> { typedef double R; };
+
         __constant__ int ctwidth;
         __constant__ int ctheight;
         __device__ unsigned int blocks_finished = 0;
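The SumType trait maps each element type to a wider accumulator (for example, unsigned char sums into unsigned int) so that per-block partial sums cannot overflow the element type. A small host-side illustration of why the promotion matters (illustrative only, not part of the commit):

    #include <cstdio>

    int main()
    {
        const int n = 1000;
        unsigned char narrow = 0;   // accumulating in the element type wraps around
        unsigned int  wide   = 0;   // accumulating in SumType<unsigned char>::R does not

        for (int i = 0; i < n; ++i)
        {
            narrow = static_cast<unsigned char>(narrow + 200);
            wide  += 200;
        }

        std::printf("narrow: %u, wide: %u\n", narrow, wide);  // narrow has wrapped, wide is 200000
        return 0;
    }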
@@ -1436,12 +1445,11 @@ namespace cv { namespace gpu { namespace mathfunc
         }
 
 
-        template <typename T>
         void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
         {
             dim3 threads, grid;
             estimate_thread_cfg(cols, rows, threads, grid);
-            bufcols = grid.x * grid.y * sizeof(T);
+            bufcols = grid.x * grid.y * sizeof(double);
             bufrows = 1;
         }
 
@@ -1454,17 +1462,17 @@ namespace cv { namespace gpu { namespace mathfunc
             cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
         }
 
-        template <typename T, int nthreads>
-        __global__ void sum_kernel(const DevMem2D_<T> src, T* result)
+        template <typename T, typename R, int nthreads>
+        __global__ void sum_kernel(const DevMem2D_<T> src, R* result)
         {
-            __shared__ T smem[nthreads];
+            __shared__ R smem[nthreads];
 
             const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
             const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
             const int tid = threadIdx.y * blockDim.x + threadIdx.x;
             const int bid = blockIdx.y * gridDim.x + blockIdx.x;
 
-            T sum = 0;
+            R sum = 0;
             for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
             {
                 const T* ptr = src.ptr(y0 + y * blockDim.y);
@@ -1475,7 +1483,7 @@ namespace cv { namespace gpu { namespace mathfunc
             smem[tid] = sum;
             __syncthreads();
 
-            sum_in_smem<nthreads, T>(smem, tid);
+            sum_in_smem<nthreads, R>(smem, tid);
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
             __shared__ bool is_last;
@@ -1496,7 +1504,7 @@ namespace cv { namespace gpu { namespace mathfunc
                 smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
                 __syncthreads();
 
-                sum_in_smem<nthreads, T>(smem, tid);
+                sum_in_smem<nthreads, R>(smem, tid);
 
                 if (tid == 0)
                 {
@@ -1510,14 +1518,16 @@ namespace cv { namespace gpu { namespace mathfunc
         }
 
 
-        template <typename T, int nthreads>
-        __global__ void sum_pass2_kernel(T* result, int size)
+        template <typename T, typename R, int nthreads>
+        __global__ void sum_pass2_kernel(R* result, int size)
         {
-            __shared__ T smem[nthreads];
+            __shared__ R smem[nthreads];
             int tid = threadIdx.y * blockDim.x + threadIdx.x;
 
             smem[tid] = tid < size ? result[tid] : 0;
-            sum_in_smem<nthreads, T>(smem, tid);
+            __syncthreads();
+
+            sum_in_smem<nthreads, R>(smem, tid);
 
             if (tid == 0)
                 result[0] = smem[0];
@@ -1527,60 +1537,61 @@ namespace cv { namespace gpu { namespace mathfunc
 
 
     template <typename T>
-    T sum_multipass_caller(const DevMem2D_<T> src, PtrStep buf)
+    void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum)
     {
         using namespace sum;
+        typedef typename SumType<T>::R R;
 
         dim3 threads, grid;
         estimate_thread_cfg(src.cols, src.rows, threads, grid);
         set_kernel_consts(src.cols, src.rows, threads, grid);
 
-        T* buf_ = (T*)buf.ptr(0);
+        R* buf_ = (R*)buf.ptr(0);
 
-        sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_);
-        sum_pass2_kernel<T, threads_x * threads_y><<<1, threads_x * threads_y>>>(
+        sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
+        sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
                 buf_, grid.x * grid.y);
         cudaSafeCall(cudaThreadSynchronize());
 
-        T sum;
-        cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost));
-
-        return sum;
+        R result = 0;
+        cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
+        sum[0] = result;
     }
 
-    template unsigned char sum_multipass_caller<unsigned char>(const DevMem2D_<unsigned char>, PtrStep);
-    template char sum_multipass_caller<char>(const DevMem2D_<char>, PtrStep);
-    template unsigned short sum_multipass_caller<unsigned short>(const DevMem2D_<unsigned short>, PtrStep);
-    template short sum_multipass_caller<short>(const DevMem2D_<short>, PtrStep);
-    template int sum_multipass_caller<int>(const DevMem2D_<int>, PtrStep);
-    template float sum_multipass_caller<float>(const DevMem2D_<float>, PtrStep);
+    template void sum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*);
+    template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*);
+    template void sum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);
+    template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*);
+    template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*);
+    template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*);
 
 
     template <typename T>
-    T sum_caller(const DevMem2D_<T> src, PtrStep buf)
+    void sum_caller(const DevMem2D src, PtrStep buf, double* sum)
     {
         using namespace sum;
+        typedef typename SumType<T>::R R;
 
         dim3 threads, grid;
        estimate_thread_cfg(src.cols, src.rows, threads, grid);
         set_kernel_consts(src.cols, src.rows, threads, grid);
 
-        T* buf_ = (T*)buf.ptr(0);
+        R* buf_ = (R*)buf.ptr(0);
 
-        sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_);
+        sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
         cudaSafeCall(cudaThreadSynchronize());
 
-        T sum;
-        cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost));
-
-        return sum;
+        R result = 0;
+        cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
+        sum[0] = result;
     }
 
-    template unsigned char sum_caller<unsigned char>(const DevMem2D_<unsigned char>, PtrStep);
-    template char sum_caller<char>(const DevMem2D_<char>, PtrStep);
-    template unsigned short sum_caller<unsigned short>(const DevMem2D_<unsigned short>, PtrStep);
-    template short sum_caller<short>(const DevMem2D_<short>, PtrStep);
-    template int sum_caller<int>(const DevMem2D_<int>, PtrStep);
-    template float sum_caller<float>(const DevMem2D_<float>, PtrStep);
-    template double sum_caller<double>(const DevMem2D_<double>, PtrStep);
+    template void sum_caller<unsigned char>(const DevMem2D, PtrStep, double*);
+    template void sum_caller<char>(const DevMem2D, PtrStep, double*);
+    template void sum_caller<unsigned short>(const DevMem2D, PtrStep, double*);
+    template void sum_caller<short>(const DevMem2D, PtrStep, double*);
+    template void sum_caller<int>(const DevMem2D, PtrStep, double*);
+    template void sum_caller<float>(const DevMem2D, PtrStep, double*);
+    template void sum_caller<double>(const DevMem2D, PtrStep, double*);
 }}}
 
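In sum_caller a single launch suffices: on devices with global atomics the last block to finish (tracked through blocks_finished and the is_last flag guarded by __CUDA_ARCH__ >= 110) folds all per-block partial sums itself. sum_multipass_caller targets pre-atomics hardware and instead reduces the grid.x * grid.y partials left in buf with a second launch of sum_pass2_kernel. A CPU-side sketch of that two-pass idea, for orientation only (plain C++, not the CUDA kernels above):

    #include <cstdio>
    #include <vector>

    // Pass 1: each "block" reduces its chunk to one partial sum.
    static void pass1(const std::vector<int>& src, int blockSize, std::vector<long long>& partials)
    {
        partials.clear();
        for (size_t i = 0; i < src.size(); i += blockSize)
        {
            long long s = 0;
            for (size_t j = i; j < src.size() && j < i + blockSize; ++j)
                s += src[j];
            partials.push_back(s);          // one value per block, like buf in the commit
        }
    }

    // Pass 2: a single "block" reduces the partial sums to the final result.
    static long long pass2(const std::vector<long long>& partials)
    {
        long long s = 0;
        for (size_t i = 0; i < partials.size(); ++i)
            s += partials[i];
        return s;
    }

    int main()
    {
        std::vector<int> src(1000, 3);
        std::vector<long long> partials;
        pass1(src, 128, partials);
        std::printf("sum = %lld\n", pass2(partials));   // 3000
        return 0;
    }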
@@ -458,29 +458,6 @@ struct CV_GpuNppImageFlipTest : public CV_GpuArithmTest
     }
 };
 
-////////////////////////////////////////////////////////////////////////////////
-// sum
-struct CV_GpuNppImageSumTest : public CV_GpuArithmTest
-{
-    CV_GpuNppImageSumTest() : CV_GpuArithmTest( "GPU-NppImageSum", "sum" ) {}
-
-    int test( const Mat& mat1, const Mat& )
-    {
-        if (mat1.type() != CV_8UC1 && mat1.type() != CV_8UC4)
-        {
-            ts->printf(CvTS::LOG, "\tUnsupported type\t");
-            return CvTS::OK;
-        }
-
-        Scalar cpures = cv::sum(mat1);
-
-        GpuMat gpu1(mat1);
-        Scalar gpures = cv::gpu::sum(gpu1);
-
-        return CheckNorm(cpures, gpures);
-    }
-};
-
 ////////////////////////////////////////////////////////////////////////////////
 // LUT
 struct CV_GpuNppImageLUTTest : public CV_GpuArithmTest
@@ -949,27 +926,49 @@ struct CV_GpuCountNonZeroTest: CvTest
     }
 };
 
-////////////////////////////////////////////////////////////////////////////////
-// min/max
-
-struct CV_GpuImageMinMaxTest : public CV_GpuArithmTest
-{
-    CV_GpuImageMinMaxTest() : CV_GpuArithmTest( "GPU-ImageMinMax", "min/max" ) {}
-
-    int test( const Mat& mat1, const Mat& mat2 )
-    {
-        cv::Mat cpuMinRes, cpuMaxRes;
-        cv::min(mat1, mat2, cpuMinRes);
-        cv::max(mat1, mat2, cpuMaxRes);
-
-        GpuMat gpu1(mat1);
-        GpuMat gpu2(mat2);
-        GpuMat gpuMinRes, gpuMaxRes;
-        cv::gpu::min(gpu1, gpu2, gpuMinRes);
-        cv::gpu::max(gpu1, gpu2, gpuMaxRes);
-
-        return CheckNorm(cpuMinRes, gpuMinRes) == CvTS::OK && CheckNorm(cpuMaxRes, gpuMaxRes) == CvTS::OK ?
-            CvTS::OK : CvTS::FAIL_GENERIC;
-    }
-};
-
+//////////////////////////////////////////////////////////////////////////////
+// sum
+
+struct CV_GpuSumTest: CvTest
+{
+    CV_GpuSumTest(): CvTest("GPU-SumTest", "sum") {}
+
+    void run(int)
+    {
+        try
+        {
+            Mat src;
+            Scalar a, b;
+            double max_err = 1e-6;
+
+            int typemax = hasNativeDoubleSupport(getDevice()) ? CV_64F : CV_32F;
+            for (int type = CV_8U; type <= typemax; ++type)
+            {
+                gen(1 + rand() % 1000, 1 + rand() % 1000, type, src);
+                a = sum(src);
+                b = sum(GpuMat(src));
+
+                if (abs(a[0] - b[0]) > src.size().area() * max_err)
+                {
+                    ts->printf(CvTS::CONSOLE, "cols: %d, rows: %d, expected: %f, actual: %f\n", src.cols, src.rows, a[0], b[0]);
+                    ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+                    return;
+                }
+            }
+        }
+        catch (const Exception& e)
+        {
+            if (!check_and_treat_gpu_exception(e, ts)) throw;
+            return;
+        }
+    }
+
+    void gen(int cols, int rows, int type, Mat& m)
+    {
+        m.create(rows, cols, type);
+        RNG rng;
+        rng.fill(m, RNG::UNIFORM, Scalar::all(0), Scalar::all(20));
+    }
+};
+
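The test tolerates a deviation of src.size().area() * max_err rather than an absolute threshold because CV_32F inputs are accumulated in float (SumType<float>::R is float), so rounding error grows roughly with the number of summed elements, while the integer depths accumulate exactly. A sketch of the same check in isolation, with hypothetical values:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const int    area     = 640 * 480;      // number of summed elements
        const double max_err  = 1e-6;           // per-element tolerance, as in the test
        const double expected = 123456.789;     // CPU reference sum (hypothetical)
        const double actual   = 123456.801;     // GPU result (hypothetical)

        const bool ok = std::fabs(expected - actual) <= area * max_err;
        std::printf("%s\n", ok ? "OK" : "FAIL");
        return ok ? 0 : 1;
    }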
@@ -992,7 +991,6 @@ CV_GpuNppImageCompareTest CV_GpuNppImageCompare_test;
 CV_GpuNppImageMeanStdDevTest CV_GpuNppImageMeanStdDev_test;
 CV_GpuNppImageNormTest CV_GpuNppImageNorm_test;
 CV_GpuNppImageFlipTest CV_GpuNppImageFlip_test;
-CV_GpuNppImageSumTest CV_GpuNppImageSum_test;
 CV_GpuNppImageLUTTest CV_GpuNppImageLUT_test;
 CV_GpuNppImageExpTest CV_GpuNppImageExp_test;
 CV_GpuNppImageLogTest CV_GpuNppImageLog_test;
@@ -1003,4 +1001,4 @@ CV_GpuNppImagePolarToCartTest CV_GpuNppImagePolarToCart_test;
 CV_GpuMinMaxTest CV_GpuMinMaxTest_test;
 CV_GpuMinMaxLocTest CV_GpuMinMaxLocTest_test;
 CV_GpuCountNonZeroTest CV_CountNonZero_test;
-CV_GpuImageMinMaxTest CV_GpuImageMinMax_test;
+CV_GpuSumTest CV_GpuSum_test;
@@ -46,9 +46,6 @@ CvTS test_system("gpu");
 const char* blacklist[] =
 {
     "GPU-AsyncGpuMatOperator", // crash
-
-    "GPU-NppImageSum", // crash, probably npp bug
-
     "GPU-NppImageCanny", // NPP_TEXTURE_BIND_ERROR
     0
 };