added tests for gpu::sum; it now supports all data types, but single-channel images only
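The substance of the change: the device-side sum now accumulates in a promoted type R chosen per element type T, and the host callers return the result through a double* out-parameter instead of returning T by value. A minimal sketch of the kind of test this enables, assuming the 2.x-era gpu module API (Scalar cv::gpu::sum(const GpuMat&)) and checking each supported depth against the CPU cv::sum:

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>
    #include <iostream>

    int main()
    {
        const int depths[] = { CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F };
        for (int i = 0; i < 6; ++i)
        {
            cv::Mat src(128, 128, depths[i]);
            cv::randu(src, cv::Scalar::all(0), cv::Scalar::all(16)); // small values: no overflow in the CPU reference
            cv::Scalar cpu = cv::sum(src);
            cv::Scalar gpu = cv::gpu::sum(cv::gpu::GpuMat(src));    // single-channel input only
            std::cout << "depth " << depths[i] << ": cpu=" << cpu[0] << ", gpu=" << gpu[0] << std::endl;
        }
        return 0;
    }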
@@ -1419,6 +1419,15 @@ namespace cv { namespace gpu { namespace mathfunc
     namespace sum
     {
 
+        template <typename T> struct SumType {};
+        template <> struct SumType<unsigned char> { typedef unsigned int R; };
+        template <> struct SumType<char> { typedef int R; };
+        template <> struct SumType<unsigned short> { typedef unsigned int R; };
+        template <> struct SumType<short> { typedef int R; };
+        template <> struct SumType<int> { typedef int R; };
+        template <> struct SumType<float> { typedef float R; };
+        template <> struct SumType<double> { typedef double R; };
+
         __constant__ int ctwidth;
         __constant__ int ctheight;
         __device__ unsigned int blocks_finished = 0;
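The SumType trait exists so partial sums are accumulated in a type wide enough not to overflow: unsigned char promotes to unsigned int, char and short to int, and float and double accumulate in themselves. A plain C++ sketch of the same idea (illustration only, not the kernel):

    #include <cstdio>

    template <typename T> struct SumType {};
    template <> struct SumType<unsigned char> { typedef unsigned int R; };

    template <typename T>
    typename SumType<T>::R sum(const T* data, int n)
    {
        typename SumType<T>::R s = 0; // wide accumulator, as in sum_kernel's "R sum = 0"
        for (int i = 0; i < n; ++i)
            s += data[i];
        return s;
    }

    int main()
    {
        unsigned char px[1000];
        for (int i = 0; i < 1000; ++i) px[i] = 200;
        // 1000 * 200 = 200000: far beyond the 255 an unsigned char accumulator could hold
        std::printf("%u\n", sum(px, 1000)); // prints 200000
        return 0;
    }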
@@ -1436,12 +1445,11 @@ namespace cv { namespace gpu { namespace mathfunc
         }
 
 
         template <typename T>
         void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
         {
             dim3 threads, grid;
             estimate_thread_cfg(cols, rows, threads, grid);
-            bufcols = grid.x * grid.y * sizeof(T);
+            bufcols = grid.x * grid.y * sizeof(double);
             bufrows = 1;
         }
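The buffer now reserves one partial-sum slot per block, each sized for the widest accumulator (sizeof(double)), so a single preallocated buffer can be reused for every element type. In isolation, with hypothetical grid numbers (buf_size mirrors get_buf_size_required):

    #include <cstdio>

    // one slot per block, each wide enough for the largest accumulator type
    void buf_size(int gridx, int gridy, int& bufcols, int& bufrows)
    {
        bufcols = gridx * gridy * (int)sizeof(double);
        bufrows = 1;
    }

    int main()
    {
        int cols = 0, rows = 0;
        buf_size(8, 8, cols, rows);                 // a hypothetical 8x8 grid of blocks
        std::printf("%d x %d bytes\n", cols, rows); // 512 x 1
        return 0;
    }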
@@ -1454,17 +1462,17 @@ namespace cv { namespace gpu { namespace mathfunc
             cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
         }
 
-        template <typename T, int nthreads>
-        __global__ void sum_kernel(const DevMem2D_<T> src, T* result)
+        template <typename T, typename R, int nthreads>
+        __global__ void sum_kernel(const DevMem2D_<T> src, R* result)
         {
-            __shared__ T smem[nthreads];
+            __shared__ R smem[nthreads];
 
             const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
             const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
             const int tid = threadIdx.y * blockDim.x + threadIdx.x;
             const int bid = blockIdx.y * gridDim.x + blockIdx.x;
 
-            T sum = 0;
+            R sum = 0;
             for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
             {
                 const T* ptr = src.ptr(y0 + y * blockDim.y);
@@ -1475,7 +1483,7 @@ namespace cv { namespace gpu { namespace mathfunc
             smem[tid] = sum;
             __syncthreads();
 
-            sum_in_smem<nthreads, T>(smem, tid);
+            sum_in_smem<nthreads, R>(smem, tid);
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
             __shared__ bool is_last;
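sum_in_smem is the shared-memory tree reduction used for both the per-block pass and the final pass; it is defined earlier in mathfunc.cu, and only its template argument changes here (T to R) so the shared array and the partial sums agree on the accumulator type. A plausible shape for such a helper, as a sketch rather than the file's exact body:

    template <int nthreads, typename R>
    __device__ void sum_in_smem(volatile R* smem, int tid)
    {
        // halve the number of active threads each step; after log2(nthreads)
        // steps, smem[0] holds the sum of all nthreads partial values
        for (int s = nthreads / 2; s > 0; s >>= 1)
        {
            if (tid < s)
                smem[tid] = smem[tid] + smem[tid + s];
            __syncthreads();
        }
    }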
@@ -1496,7 +1504,7 @@ namespace cv { namespace gpu { namespace mathfunc
                 smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
                 __syncthreads();
 
-                sum_in_smem<nthreads, T>(smem, tid);
+                sum_in_smem<nthreads, R>(smem, tid);
 
                 if (tid == 0)
                 {
@@ -1510,14 +1518,16 @@ namespace cv { namespace gpu { namespace mathfunc
         }
 
 
-        template <typename T, int nthreads>
-        __global__ void sum_pass2_kernel(T* result, int size)
+        template <typename T, typename R, int nthreads>
+        __global__ void sum_pass2_kernel(R* result, int size)
         {
-            __shared__ T smem[nthreads];
+            __shared__ R smem[nthreads];
             int tid = threadIdx.y * blockDim.x + threadIdx.x;
 
             smem[tid] = tid < size ? result[tid] : 0;
-            sum_in_smem<nthreads, T>(smem, tid);
+            __syncthreads();
+
+            sum_in_smem<nthreads, R>(smem, tid);
 
             if (tid == 0)
                 result[0] = smem[0];
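Besides the T-to-R switch, pass 2 gains a __syncthreads() between the store into shared memory and the reduction: without it, a thread could start summing a neighbour's slot before that neighbour has written it. The hazard in isolation, as a hypothetical standalone kernel (launched e.g. as reduce_block<256><<<1, 256>>>(d_data)):

    template <int nthreads>
    __global__ void reduce_block(int* data)
    {
        __shared__ int smem[nthreads];
        const int tid = threadIdx.x;
        smem[tid] = data[tid]; // every thread publishes its element...
        __syncthreads();       // ...which must be visible before any thread reads a neighbour's slot
        for (int s = nthreads / 2; s > 0; s >>= 1)
        {
            if (tid < s)
                smem[tid] += smem[tid + s];
            __syncthreads();
        }
        if (tid == 0)
            data[0] = smem[0];
    }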
@@ -1527,60 +1537,61 @@ namespace cv { namespace gpu { namespace mathfunc
 
 
     template <typename T>
-    T sum_multipass_caller(const DevMem2D_<T> src, PtrStep buf)
+    void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum)
     {
         using namespace sum;
+        typedef typename SumType<T>::R R;
 
         dim3 threads, grid;
         estimate_thread_cfg(src.cols, src.rows, threads, grid);
         set_kernel_consts(src.cols, src.rows, threads, grid);
 
-        T* buf_ = (T*)buf.ptr(0);
+        R* buf_ = (R*)buf.ptr(0);
 
-        sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_);
-        sum_pass2_kernel<T, threads_x * threads_y><<<1, threads_x * threads_y>>>(
+        sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
+        sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
             buf_, grid.x * grid.y);
         cudaSafeCall(cudaThreadSynchronize());
 
-        T sum;
-        cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost));
-
-        return sum;
+        R result = 0;
+        cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
+        sum[0] = result;
     }
 
-    template unsigned char sum_multipass_caller<unsigned char>(const DevMem2D_<unsigned char>, PtrStep);
-    template char sum_multipass_caller<char>(const DevMem2D_<char>, PtrStep);
-    template unsigned short sum_multipass_caller<unsigned short>(const DevMem2D_<unsigned short>, PtrStep);
-    template short sum_multipass_caller<short>(const DevMem2D_<short>, PtrStep);
-    template int sum_multipass_caller<int>(const DevMem2D_<int>, PtrStep);
-    template float sum_multipass_caller<float>(const DevMem2D_<float>, PtrStep);
+    template void sum_multipass_caller<unsigned char>(const DevMem2D, PtrStep, double*);
+    template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*);
+    template void sum_multipass_caller<unsigned short>(const DevMem2D, PtrStep, double*);
+    template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*);
+    template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*);
+    template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*);
 
 
     template <typename T>
-    T sum_caller(const DevMem2D_<T> src, PtrStep buf)
+    void sum_caller(const DevMem2D src, PtrStep buf, double* sum)
     {
         using namespace sum;
+        typedef typename SumType<T>::R R;
 
         dim3 threads, grid;
         estimate_thread_cfg(src.cols, src.rows, threads, grid);
         set_kernel_consts(src.cols, src.rows, threads, grid);
 
-        T* buf_ = (T*)buf.ptr(0);
+        R* buf_ = (R*)buf.ptr(0);
 
-        sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_);
+        sum_kernel<T, R, threads_x * threads_y><<<grid, threads>>>((const DevMem2D_<T>)src, buf_);
         cudaSafeCall(cudaThreadSynchronize());
 
-        T sum;
-        cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost));
-
-        return sum;
+        R result = 0;
+        cudaSafeCall(cudaMemcpy(&result, buf_, sizeof(result), cudaMemcpyDeviceToHost));
+        sum[0] = result;
     }
 
-    template unsigned char sum_caller<unsigned char>(const DevMem2D_<unsigned char>, PtrStep);
-    template char sum_caller<char>(const DevMem2D_<char>, PtrStep);
-    template unsigned short sum_caller<unsigned short>(const DevMem2D_<unsigned short>, PtrStep);
-    template short sum_caller<short>(const DevMem2D_<short>, PtrStep);
-    template int sum_caller<int>(const DevMem2D_<int>, PtrStep);
-    template float sum_caller<float>(const DevMem2D_<float>, PtrStep);
-    template double sum_caller<double>(const DevMem2D_<double>, PtrStep);
+    template void sum_caller<unsigned char>(const DevMem2D, PtrStep, double*);
+    template void sum_caller<char>(const DevMem2D, PtrStep, double*);
+    template void sum_caller<unsigned short>(const DevMem2D, PtrStep, double*);
+    template void sum_caller<short>(const DevMem2D, PtrStep, double*);
+    template void sum_caller<int>(const DevMem2D, PtrStep, double*);
+    template void sum_caller<float>(const DevMem2D, PtrStep, double*);
+    template void sum_caller<double>(const DevMem2D, PtrStep, double*);
 }}}
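With every caller sharing the (DevMem2D, PtrStep, double*) shape, the host side can select an instantiation from a table indexed by runtime depth. A hypothetical sketch of such a dispatcher (sum_dispatch is illustrative, not OpenCV API; the header name is assumed from the 2.x gpu module, and the depth order assumed is CV_8U..CV_64F):

    #include "opencv2/gpu/devmem2d.hpp"

    namespace cv { namespace gpu { namespace mathfunc {
        // declarations matched by the explicit instantiations exported above
        template <typename T>
        void sum_caller(const DevMem2D src, PtrStep buf, double* sum);
    }}}

    double sum_dispatch(const cv::gpu::DevMem2D& src, cv::gpu::PtrStep buf, int depth)
    {
        using namespace cv::gpu;
        using namespace cv::gpu::mathfunc;
        typedef void (*Caller)(const DevMem2D, PtrStep, double*);
        static const Caller callers[] =
        {
            sum_caller<unsigned char>, sum_caller<char>, sum_caller<unsigned short>,
            sum_caller<short>, sum_caller<int>, sum_caller<float>, sum_caller<double>
        };
        double result = 0;
        callers[depth](src, buf, &result); // one uniform call site for all depths
        return result;
    }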