fixed minor bug in gpu module, added first version of sum

2010-12-10 13:36:00 +00:00
parent d557c800a7
commit 9f80317ffa
3 changed files with 179 additions and 20 deletions
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -719,7 +719,7 @@ namespace cv { namespace gpu { namespace imgproc
 ////////////////////////////// Column Sum //////////////////////////////////////
-    __global__ void columnSumKernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
+    __global__ void column_sum_kernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -745,7 +745,7 @@ namespace cv { namespace gpu { namespace imgproc
        dim3 threads(256);
        dim3 grid(divUp(src.cols, threads.x));
-        columnSumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
+        column_sum_kernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
        cudaSafeCall(cudaThreadSynchronize());
    }
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -450,6 +450,8 @@ namespace cv { namespace gpu { namespace mathfunc
    {
        threads = dim3(32, 8);
        grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
        grid.x = min(grid.x, threads.x);
        grid.y = min(grid.y, threads.y);
    }
@@ -662,7 +664,6 @@ namespace cv { namespace gpu { namespace mathfunc
        {
            minval[0] = (T)sminval[0];
            maxval[0] = (T)smaxval[0];
            blocks_finished = 0;
        }
    }
@@ -744,6 +745,8 @@ namespace cv { namespace gpu { namespace mathfunc
    {
        threads = dim3(32, 8);
        grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
        grid.x = min(grid.x, threads.x);
        grid.y = min(grid.y, threads.y);
    }
@@ -1005,7 +1008,6 @@ namespace cv { namespace gpu { namespace mathfunc
            maxval[0] = (T)smaxval[0];
            minloc[0] = sminloc[0];
            maxloc[0] = smaxloc[0];
            blocks_finished = 0;
        }
    }
@@ -1102,6 +1104,8 @@ namespace cv { namespace gpu { namespace mathfunc
    {
        threads = dim3(32, 8);
        grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
        grid.x = min(grid.x, threads.x);
        grid.y = min(grid.y, threads.y);
    }
@@ -1212,13 +1216,12 @@ namespace cv { namespace gpu { namespace mathfunc
        unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
        scount[tid] = tid < size ? count[tid] : 0;
-		sum_in_smem<nthreads, unsigned int>(scount, tid);
+        __syncthreads();
-		if (tid == 0) 
+        sum_in_smem<nthreads, unsigned int>(scount, tid);
-        {
+
        if (tid == 0) 
            count[0] = scount[0];
            blocks_finished = 0;
        }
    }
@@ -1409,4 +1412,171 @@ namespace cv { namespace gpu { namespace mathfunc
    template void max_gpu<int   >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
    template void max_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
    template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
 //////////////////////////////////////////////////////////////////////////////
 // Sum
    namespace sum 
    {
    __constant__ int ctwidth;
    __constant__ int ctheight;
    __device__ unsigned int blocks_finished = 0;
    const int threads_x = 32;
    const int threads_y = 8;
    void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
    {
        threads = dim3(threads_x, threads_y);
        grid = dim3(divUp(cols, threads.x * threads.y), 
                    divUp(rows, threads.y * threads.x));
        grid.x = min(grid.x, threads.x);
        grid.y = min(grid.y, threads.y);
    }
    template <typename T>
    void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
    {
        dim3 threads, grid;
        estimate_thread_cfg(cols, rows, threads, grid);
        bufcols = grid.x * grid.y * sizeof(T);
        bufrows = 1;
    }
    void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
    {        
        int twidth = divUp(divUp(cols, grid.x), threads.x);
        int theight = divUp(divUp(rows, grid.y), threads.y);
        cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); 
        cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); 
    }
    template <typename T, int nthreads>
    __global__ void sum_kernel(const DevMem2D_<T> src, T* result)
    {
        __shared__ T smem[nthreads];
        const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
        const int bid = blockIdx.y * gridDim.x + blockIdx.x;
        T sum = 0;
        for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
        {
            const T* ptr = src.ptr(y0 + y * blockDim.y);
            for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
                sum += ptr[x0 + x * blockDim.x];
        }
        smem[tid] = sum;
        __syncthreads();
        sum_in_smem<nthreads, T>(smem, tid);
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
        __shared__ bool is_last;
        if (tid == 0)
        {
            result[bid] = smem[0];
            __threadfence();
            unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
            is_last = (ticket == gridDim.x * gridDim.y - 1);
        }
        __syncthreads();
        if (is_last)
        {
            smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
            __syncthreads();
            sum_in_smem<nthreads, T>(smem, tid);
            if (tid == 0) 
            {
                result[0] = smem[0];
                blocks_finished = 0;
            }
        }
 #else
        if (tid == 0) result[bid] = smem[0];
 #endif
    }
    template <typename T>
    T sum_caller(const DevMem2D_<T> src, PtrStep buf)
    {
        dim3 threads, grid;
        estimate_thread_cfg(src.cols, src.rows, threads, grid);
        set_kernel_consts(src.cols, src.rows, threads, grid);
        T* buf_ = (T*)buf.ptr(0);
        sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_);
        cudaSafeCall(cudaThreadSynchronize());
        T sum;
        cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost));
        return sum;
    }  
    template unsigned char sum_caller<unsigned char>(const DevMem2D_<unsigned char>, PtrStep);
    template char sum_caller<char>(const DevMem2D_<char>, PtrStep);
    template unsigned short sum_caller<unsigned short>(const DevMem2D_<unsigned short>, PtrStep);
    template short sum_caller<short>(const DevMem2D_<short>, PtrStep);
    template int sum_caller<int>(const DevMem2D_<int>, PtrStep);
    template float sum_caller<float>(const DevMem2D_<float>, PtrStep);
    template double sum_caller<double>(const DevMem2D_<double>, PtrStep);
    template <typename T, int nthreads>
    __global__ void sum_pass2_kernel(T* result, int size)
    {
        __shared__ T smem[nthreads];
        int tid = threadIdx.y * blockDim.x + threadIdx.x;
        smem[tid] = tid < size ? result[tid] : 0;
        sum_in_smem<nthreads, T>(smem, tid);
        if (tid == 0) 
            result[0] = smem[0];
    }
    template <typename T>
    T sum_multipass_caller(const DevMem2D_<T> src, PtrStep buf)
    {
        dim3 threads, grid;
        estimate_thread_cfg(src.cols, src.rows, threads, grid);
        set_kernel_consts(src.cols, src.rows, threads, grid);
        T* buf_ = (T*)buf.ptr(0);
        sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_);
        sum_pass2_kernel<T, threads_x * threads_y><<<1, threads_x * threads_y>>>(
                buf_, grid.x * grid.y);
        cudaSafeCall(cudaThreadSynchronize());
        T sum;
        cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost));
        return sum;
    }  
    template unsigned char sum_multipass_caller<unsigned char>(const DevMem2D_<unsigned char>, PtrStep);
    template char sum_multipass_caller<char>(const DevMem2D_<char>, PtrStep);
    template unsigned short sum_multipass_caller<unsigned short>(const DevMem2D_<unsigned short>, PtrStep);
    template short sum_multipass_caller<short>(const DevMem2D_<short>, PtrStep);
    template int sum_multipass_caller<int>(const DevMem2D_<int>, PtrStep);
    template float sum_multipass_caller<float>(const DevMem2D_<float>, PtrStep);
    } // namespace sum
 }}}
--- a/modules/gpu/src/match_template.cpp
+++ b/modules/gpu/src/match_template.cpp
@@ -244,17 +244,6 @@ namespace
    {
        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
        imgproc::matchTemplateNaive_8U_SQDIFF(image, templ, result);
        //GpuMat image_sum;
        //GpuMat image_sumsq;
        //integral(image, image_sum, image_sumsq);
        //float templ_sumsq = 0.f;
        //matchTemplate_8U_CCORR(image, templ, result);
        //imgproc::matchTemplatePrepared_8U_SQDIFF(
        //        templ.cols, templ.rows, image_sumsq, templ_sumsq, result);
    }