implemented gpu::reduce

2011-09-22 07:08:26 +00:00
parent ce35a6d8be
commit 8b23c79294
4 changed files with 496 additions and 0 deletions
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@@ -1804,4 +1804,278 @@ namespace cv { namespace gpu { namespace mathfunc
    template void sqrSumCaller<short>(const DevMem2D, PtrStep, double*, int);
    template void sqrSumCaller<int>(const DevMem2D, PtrStep, double*, int);
    template void sqrSumCaller<float>(const DevMem2D, PtrStep, double*, int);
+
+    //////////////////////////////////////////////////////////////////////////////
+    // reduce
+
+    template <typename S> struct SumReductor
+    {
+        __device__ __forceinline__ S startValue() const
+        {
+            return 0;
+        }
+
+        __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const
+        {
+            return a + b;
+        }
+
+        __device__ __forceinline S result(S r, double) const
+        {
+            return r;
+        }
+    };
+
+    template <typename S> struct AvgReductor
+    {
+        __device__ __forceinline__ S startValue() const
+        {
+            return 0;
+        }
+
+        __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const
+        {
+            return a + b;
+        }
+
+        __device__ __forceinline double result(S r, double sz) const
+        {
+            return r / sz;
+        }
+    };
+
+    template <typename S> struct MinReductor
+    {
+        __device__ __forceinline__ S startValue() const
+        {
+            return numeric_limits<S>::max();
+        }
+
+        template <typename T> __device__ __forceinline__ T operator ()(volatile T a, volatile T b) const
+        {
+            return saturate_cast<T>(::min(a, b));
+        }
+        __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const
+        {
+            return ::fmin(a, b);
+        }
+
+        __device__ __forceinline S result(S r, double) const
+        {
+            return r;
+        }
+    };
+
+    template <typename S> struct MaxReductor
+    {
+        __device__ __forceinline__ S startValue() const
+        {
+            return numeric_limits<S>::min();
+        }
+
+        template <typename T> __device__ __forceinline__ int operator ()(volatile T a, volatile T b) const
+        {
+            return ::max(a, b);
+        }
+        __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const
+        {
+            return ::fmax(a, b);
+        }
+
+        __device__ __forceinline S result(S r, double) const
+        {
+            return r;
+        }
+    };
+
+    template <class Op, typename T, typename S, typename D> __global__ void reduceRows(const DevMem2D_<T> src, D* dst, const Op op)
+    {
+        __shared__ S smem[16 * 16];
+
+        const int x = blockIdx.x * 16 + threadIdx.x;
+
+        if (x < src.cols)
+        {
+            S myVal = op.startValue();
+
+            for (int y = threadIdx.y; y < src.rows; y += 16)
+                myVal = op(myVal, src.ptr(y)[x]);
+
+            smem[threadIdx.y * 16 + threadIdx.x] = myVal;
+            __syncthreads();
+
+            if (threadIdx.y == 0)
+            {
+                myVal = smem[threadIdx.x];
+
+                #pragma unroll
+                for (int i = 1; i < 16; ++i)
+                    myVal = op(myVal, smem[i * 16 + threadIdx.x]);
+
+                dst[x] = saturate_cast<D>(op.result(myVal, src.rows));
+            }
+        }
+    }
+
+    template <template <typename> class Op, typename T, typename S, typename D> void reduceRows_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)
+    {
+        const dim3 block(16, 16);
+        const dim3 grid(divUp(src.cols, block.x));
+
+        Op<S> op;
+        reduceRows<Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+    }
+
+    template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);
+
+        static const caller_t callers[] = 
+        {
+            reduceRows_caller<SumReductor, T, S, D>, 
+            reduceRows_caller<AvgReductor, T, S, D>, 
+            reduceRows_caller<MaxReductor, T, S, D>, 
+            reduceRows_caller<MinReductor, T, S, D>
+        };
+
+        callers[reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);
+    }
+
+    template void reduceRows_gpu<uchar, int, uchar>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceRows_gpu<uchar, int, int>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceRows_gpu<uchar, int, float>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);  
+
+    template void reduceRows_gpu<ushort, int, ushort>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceRows_gpu<ushort, int, int>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceRows_gpu<ushort, int, float>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream); 
+
+    template void reduceRows_gpu<short, int, short>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceRows_gpu<short, int, int>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceRows_gpu<short, int, float>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream); 
+
+    template void reduceRows_gpu<int, int, int>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceRows_gpu<int, int, float>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+    template void reduceRows_gpu<float, float, float>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+
+
+
+    template <int cn, class Op, typename T, typename S, typename D> __global__ void reduceCols(const DevMem2D_<T> src, D* dst, const Op op)
+    {
+        __shared__ S smem[256 * cn];
+
+        const int y = blockIdx.x;
+
+        const T* src_row = src.ptr(y);
+
+        S myVal[cn];
+
+        #pragma unroll
+        for (int c = 0; c < cn; ++c)
+            myVal[c] = op.startValue();
+
+        for (int x = threadIdx.x; x < src.cols; x += 256)
+        {
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+                myVal[c] = op(myVal[c], src_row[x * cn + c]);
+        }
+
+        #pragma unroll
+        for (int c = 0; c < cn; ++c)
+            smem[c * 256 + threadIdx.x] = myVal[c];
+        __syncthreads();
+
+        if (threadIdx.x < 128)
+        {
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+                smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 128]);
+        }
+        __syncthreads();
+
+        if (threadIdx.x < 64)
+        {
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+                smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 64]);
+        }
+        __syncthreads();
+
+        volatile S* sdata = smem;
+
+        if (threadIdx.x < 32)
+        {
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+            {
+                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 32]);
+                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 16]);
+                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 8]);
+                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 4]);
+                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 2]);
+                sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 1]);
+            }
+        }
+        __syncthreads();
+
+        if (threadIdx.x == 0)
+        {
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+                dst[y * cn + c] = saturate_cast<D>(op.result(smem[c * 256], src.cols));
+        }
+    }
+
+    template <int cn, template <typename> class Op, typename T, typename S, typename D> void reduceCols_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)
+    {
+        const dim3 block(256);
+        const dim3 grid(src.rows);
+
+        Op<S> op;
+        reduceCols<cn, Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+
+    }
+
+    template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);
+
+        static const caller_t callers[4][4] = 
+        {
+            {reduceCols_caller<1, SumReductor, T, S, D>, reduceCols_caller<1, AvgReductor, T, S, D>, reduceCols_caller<1, MaxReductor, T, S, D>, reduceCols_caller<1, MinReductor, T, S, D>},
+            {reduceCols_caller<2, SumReductor, T, S, D>, reduceCols_caller<2, AvgReductor, T, S, D>, reduceCols_caller<2, MaxReductor, T, S, D>, reduceCols_caller<2, MinReductor, T, S, D>},
+            {reduceCols_caller<3, SumReductor, T, S, D>, reduceCols_caller<3, AvgReductor, T, S, D>, reduceCols_caller<3, MaxReductor, T, S, D>, reduceCols_caller<3, MinReductor, T, S, D>},
+            {reduceCols_caller<4, SumReductor, T, S, D>, reduceCols_caller<4, AvgReductor, T, S, D>, reduceCols_caller<4, MaxReductor, T, S, D>, reduceCols_caller<4, MinReductor, T, S, D>},
+        };
+
+        callers[cn - 1][reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);
+    }
+
+    template void reduceCols_gpu<uchar, int, uchar>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceCols_gpu<uchar, int, int>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+    template void reduceCols_gpu<uchar, int, float>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+    template void reduceCols_gpu<ushort, int, ushort>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream); 
+    template void reduceCols_gpu<ushort, int, int>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);                  
+    template void reduceCols_gpu<ushort, int, float>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+    template void reduceCols_gpu<short, int, short>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);  
+    template void reduceCols_gpu<short, int, int>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);                  
+    template void reduceCols_gpu<short, int, float>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);  
+
+    template void reduceCols_gpu<int, int, int>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);                  
+    template void reduceCols_gpu<int, int, float>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
+
+    template void reduceCols_gpu<float, float, float>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
 }}}