diff --git a/modules/cudaarithm/perf/perf_reductions.cpp b/modules/cudaarithm/perf/perf_reductions.cpp index fe7279556..aa79bf499 100644 --- a/modules/cudaarithm/perf/perf_reductions.cpp +++ b/modules/cudaarithm/perf/perf_reductions.cpp @@ -373,7 +373,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce, const cv::cuda::GpuMat d_src(src); cv::cuda::GpuMat dst; - TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp); + TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp, CV_32F); CUDA_SANITY_CHECK(dst); } @@ -381,7 +381,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce, { cv::Mat dst; - TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp); + TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp, CV_32F); CPU_SANITY_CHECK(dst); } diff --git a/modules/cudaarithm/src/cuda/reduce.cu b/modules/cudaarithm/src/cuda/reduce.cu index 2cc4a5b95..2cb2dacc7 100644 --- a/modules/cudaarithm/src/cuda/reduce.cu +++ b/modules/cudaarithm/src/cuda/reduce.cu @@ -40,301 +40,258 @@ // //M*/ -#if !defined CUDA_DISABLER +#include "opencv2/opencv_modules.hpp" -#include "opencv2/core/cuda/common.hpp" -#include "opencv2/core/cuda/saturate_cast.hpp" -#include "opencv2/core/cuda/vec_traits.hpp" -#include "opencv2/core/cuda/vec_math.hpp" -#include "opencv2/core/cuda/functional.hpp" -#include "opencv2/core/cuda/reduce.hpp" -#include "opencv2/core/cuda/limits.hpp" +#ifndef HAVE_OPENCV_CUDEV -#include "unroll_detail.hpp" +#error "opencv_cudev is required" -using namespace cv::cuda; -using namespace cv::cuda::device; +#else -namespace reduce +#include "opencv2/cudaarithm.hpp" +#include "opencv2/cudev.hpp" + +using namespace cv::cudev; + +namespace { - struct Sum + template + void reduceToRowImpl(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream) { - template - __device__ __forceinline__ T startValue() const + const GpuMat_& src = (const GpuMat_&) _src; + GpuMat_& dst = (GpuMat_&) _dst; + + switch (reduceOp) { - return VecTraits::all(0); - } + case cv::REDUCE_SUM: + gridReduceToRow< Sum >(src, dst, stream); + break; - template - __device__ __forceinline__ T operator ()(T a, T b) const - { - return a + b; - } + case cv::REDUCE_AVG: + gridReduceToRow< Avg >(src, dst, stream); + break; - template - __device__ __forceinline__ T result(T r, int) const - { - return r; - } + case cv::REDUCE_MIN: + gridReduceToRow< Min >(src, dst, stream); + break; - __host__ __device__ __forceinline__ Sum() {} - __host__ __device__ __forceinline__ Sum(const Sum&) {} - }; - - template struct OutputType - { - typedef float type; - }; - template <> struct OutputType - { - typedef double type; - }; - - struct Avg - { - template - __device__ __forceinline__ T startValue() const - { - return VecTraits::all(0); - } - - template - __device__ __forceinline__ T operator ()(T a, T b) const - { - return a + b; - } - - template - __device__ __forceinline__ typename TypeVec::elem_type>::type, VecTraits::cn>::vec_type result(T r, float sz) const - { - return r / sz; - } - - __host__ __device__ __forceinline__ Avg() {} - __host__ __device__ __forceinline__ Avg(const Avg&) {} - }; - - struct Min - { - template - __device__ __forceinline__ T startValue() const - { - return VecTraits::all(numeric_limits::elem_type>::max()); - } - - template - __device__ __forceinline__ T operator ()(T a, T b) const - { - minimum minOp; - return minOp(a, b); - } - - template - __device__ __forceinline__ T result(T r, int) const - { - return r; - } - - __host__ __device__ __forceinline__ Min() {} - __host__ __device__ __forceinline__ Min(const Min&) {} - }; - - struct Max - { - template - __device__ __forceinline__ T startValue() const - { - return VecTraits::all(-numeric_limits::elem_type>::max()); - } - - template - __device__ __forceinline__ T operator ()(T a, T b) const - { - maximum maxOp; - return maxOp(a, b); - } - - template - __device__ __forceinline__ T result(T r, int) const - { - return r; - } - - __host__ __device__ __forceinline__ Max() {} - __host__ __device__ __forceinline__ Max(const Max&) {} - }; - - /////////////////////////////////////////////////////////// - - template - __global__ void rowsKernel(const PtrStepSz src, D* dst, const Op op) - { - __shared__ S smem[16 * 16]; - - const int x = blockIdx.x * 16 + threadIdx.x; - - S myVal = op.template startValue(); - - if (x < src.cols) - { - for (int y = threadIdx.y; y < src.rows; y += 16) - { - S srcVal = src(y, x); - myVal = op(myVal, srcVal); - } - } - - smem[threadIdx.x * 16 + threadIdx.y] = myVal; - - __syncthreads(); - - volatile S* srow = smem + threadIdx.y * 16; - - myVal = srow[threadIdx.x]; - device::reduce<16>(srow, myVal, threadIdx.x, op); - - if (threadIdx.x == 0) - srow[0] = myVal; - - __syncthreads(); - - if (threadIdx.y == 0 && x < src.cols) - dst[x] = (D) op.result(smem[threadIdx.x * 16], src.rows); - } - - template - void rowsCaller(PtrStepSz src, D* dst, cudaStream_t stream) - { - const dim3 block(16, 16); - const dim3 grid(divUp(src.cols, block.x)); - - Op op; - rowsKernel<<>>(src, dst, op); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + case cv::REDUCE_MAX: + gridReduceToRow< Max >(src, dst, stream); + break; + }; } template - void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream) + void reduceToColumnImpl_(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream) { - typedef void (*func_t)(PtrStepSz src, D* dst, cudaStream_t stream); - static const func_t funcs[] = + const GpuMat_& src = (const GpuMat_&) _src; + GpuMat_& dst = (GpuMat_&) _dst; + + switch (reduceOp) { - rowsCaller, - rowsCaller, - rowsCaller, - rowsCaller + case cv::REDUCE_SUM: + gridReduceToColumn< Sum >(src, dst, stream); + break; + + case cv::REDUCE_AVG: + gridReduceToColumn< Avg >(src, dst, stream); + break; + + case cv::REDUCE_MIN: + gridReduceToColumn< Min >(src, dst, stream); + break; + + case cv::REDUCE_MAX: + gridReduceToColumn< Max >(src, dst, stream); + break; + }; + } + + template + void reduceToColumnImpl(const GpuMat& src, GpuMat& dst, int reduceOp, Stream& stream) + { + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int reduceOp, Stream& stream); + static const func_t funcs[4] = + { + reduceToColumnImpl_, + reduceToColumnImpl_::type, typename MakeVec::type, typename MakeVec::type>, + reduceToColumnImpl_::type, typename MakeVec::type, typename MakeVec::type>, + reduceToColumnImpl_::type, typename MakeVec::type, typename MakeVec::type> }; - funcs[op]((PtrStepSz) src, (D*) dst, stream); + funcs[src.channels() - 1](src, dst, reduceOp, stream); } - - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - - template void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - - /////////////////////////////////////////////////////////// - - template - __global__ void colsKernel(const PtrStepSz::vec_type> src, typename TypeVec::vec_type* dst, const Op op) - { - typedef typename TypeVec::vec_type src_type; - typedef typename TypeVec::vec_type work_type; - typedef typename TypeVec::vec_type dst_type; - - __shared__ S smem[BLOCK_SIZE * cn]; - - const int y = blockIdx.x; - - const src_type* srcRow = src.ptr(y); - - work_type myVal = op.template startValue(); - - for (int x = threadIdx.x; x < src.cols; x += BLOCK_SIZE) - myVal = op(myVal, saturate_cast(srcRow[x])); - - device::reduce(detail::Unroll::template smem_tuple(smem), detail::Unroll::tie(myVal), threadIdx.x, detail::Unroll::op(op)); - - if (threadIdx.x == 0) - dst[y] = saturate_cast(op.result(myVal, src.cols)); - } - - template void colsCaller(PtrStepSzb src, void* dst, cudaStream_t stream) - { - const int BLOCK_SIZE = 256; - - const dim3 block(BLOCK_SIZE); - const dim3 grid(src.rows); - - Op op; - colsKernel<<>>((PtrStepSz::vec_type>) src, (typename TypeVec::vec_type*) dst, op); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - - } - - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream) - { - typedef void (*func_t)(PtrStepSzb src, void* dst, cudaStream_t stream); - static const func_t funcs[5][4] = - { - {0,0,0,0}, - {colsCaller, colsCaller, colsCaller, colsCaller}, - {colsCaller, colsCaller, colsCaller, colsCaller}, - {colsCaller, colsCaller, colsCaller, colsCaller}, - {colsCaller, colsCaller, colsCaller, colsCaller}, - }; - - funcs[cn][op](src, dst, stream); - } - - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - - template void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); } -#endif /* CUDA_DISABLER */ +void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream) +{ + GpuMat src = _src.getGpuMat(); + + CV_Assert( src.channels() <= 4 ); + CV_Assert( dim == 0 || dim == 1 ); + CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN ); + + if (dtype < 0) + dtype = src.depth(); + + _dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels())); + GpuMat dst = _dst.getGpuMat(); + + if (dim == 0) + { + typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream); + static const func_t funcs[7][7] = + { + { + reduceToRowImpl, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + reduceToRowImpl, + reduceToRowImpl, + reduceToRowImpl + }, + { + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/ + }, + { + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + reduceToRowImpl, + 0 /*reduceToRowImpl*/, + reduceToRowImpl, + reduceToRowImpl, + reduceToRowImpl + }, + { + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + reduceToRowImpl, + reduceToRowImpl, + reduceToRowImpl, + reduceToRowImpl + }, + { + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + reduceToRowImpl, + reduceToRowImpl, + reduceToRowImpl + }, + { + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + reduceToRowImpl, + reduceToRowImpl + }, + { + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + 0 /*reduceToRowImpl*/, + reduceToRowImpl + } + }; + + const func_t func = funcs[src.depth()][dst.depth()]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats"); + + GpuMat dst_cont = dst.reshape(1); + func(src.reshape(1), dst_cont, reduceOp, stream); + } + else + { + typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream); + static const func_t funcs[7][7] = + { + { + reduceToColumnImpl, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + reduceToColumnImpl, + reduceToColumnImpl, + reduceToColumnImpl + }, + { + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/ + }, + { + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + reduceToColumnImpl, + 0 /*reduceToColumnImpl*/, + reduceToColumnImpl, + reduceToColumnImpl, + reduceToColumnImpl + }, + { + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + reduceToColumnImpl, + reduceToColumnImpl, + reduceToColumnImpl, + reduceToColumnImpl + }, + { + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + reduceToColumnImpl, + reduceToColumnImpl, + reduceToColumnImpl + }, + { + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + reduceToColumnImpl, + reduceToColumnImpl + }, + { + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + 0 /*reduceToColumnImpl*/, + reduceToColumnImpl + } + }; + + const func_t func = funcs[src.depth()][dst.depth()]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats"); + + func(src, dst, reduceOp, stream); + } +} + +#endif diff --git a/modules/cudaarithm/src/reductions.cpp b/modules/cudaarithm/src/reductions.cpp index d5cba336a..81307f4d5 100644 --- a/modules/cudaarithm/src/reductions.cpp +++ b/modules/cudaarithm/src/reductions.cpp @@ -186,188 +186,6 @@ double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normT return retVal; } -////////////////////////////////////////////////////////////////////////////// -// reduce - -namespace reduce -{ - template - void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - - template - void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); -} - -void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream) -{ - GpuMat src = _src.getGpuMat(); - - CV_Assert( src.channels() <= 4 ); - CV_Assert( dim == 0 || dim == 1 ); - CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN ); - - if (dtype < 0) - dtype = src.depth(); - - _dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels())); - GpuMat dst = _dst.getGpuMat(); - - if (dim == 0) - { - typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream); - static const func_t funcs[7][7] = - { - { - ::reduce::rows, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - ::reduce::rows, - ::reduce::rows, - ::reduce::rows - }, - { - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/ - }, - { - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - ::reduce::rows, - 0/*::reduce::rows*/, - ::reduce::rows, - ::reduce::rows, - ::reduce::rows - }, - { - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - ::reduce::rows, - ::reduce::rows, - ::reduce::rows, - ::reduce::rows - }, - { - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - ::reduce::rows, - ::reduce::rows, - ::reduce::rows - }, - { - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - ::reduce::rows, - ::reduce::rows - }, - { - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - 0/*::reduce::rows*/, - ::reduce::rows - } - }; - - const func_t func = funcs[src.depth()][dst.depth()]; - - if (!func) - CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats"); - - func(src.reshape(1), dst.data, reduceOp, StreamAccessor::getStream(stream)); - } - else - { - typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream); - static const func_t funcs[7][7] = - { - { - ::reduce::cols, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - ::reduce::cols, - ::reduce::cols, - ::reduce::cols - }, - { - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/ - }, - { - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - ::reduce::cols, - 0/*::reduce::cols*/, - ::reduce::cols, - ::reduce::cols, - ::reduce::cols - }, - { - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - ::reduce::cols, - ::reduce::cols, - ::reduce::cols, - ::reduce::cols - }, - { - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - ::reduce::cols, - ::reduce::cols, - ::reduce::cols - }, - { - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - ::reduce::cols, - ::reduce::cols - }, - { - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - 0/*::reduce::cols*/, - ::reduce::cols - } - }; - - const func_t func = funcs[src.depth()][dst.depth()]; - - if (!func) - CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats"); - - func(src, dst.data, src.channels(), reduceOp, StreamAccessor::getStream(stream)); - } -} - //////////////////////////////////////////////////////////////////////// // meanStdDev diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/reduce_to_column.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/reduce_to_column.hpp index b257e7503..c4852949e 100644 --- a/modules/cudev/include/opencv2/cudev/grid/detail/reduce_to_column.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/detail/reduce_to_column.hpp @@ -54,12 +54,52 @@ namespace cv { namespace cudev { namespace grid_reduce_to_vec_detail { + template struct Reduce; + + template struct Reduce + { + __device__ __forceinline__ static void call(work_elem_type smem[1][BLOCK_SIZE], work_type& myVal) + { + typename Reductor::template rebind::other op; + blockReduce(smem[0], myVal, threadIdx.x, op); + } + }; + + template struct Reduce + { + __device__ __forceinline__ static void call(work_elem_type smem[2][BLOCK_SIZE], work_type& myVal) + { + typename Reductor::template rebind::other op; + blockReduce(smem_tuple(smem[0], smem[1]), tie(myVal.x, myVal.y), threadIdx.x, make_tuple(op, op)); + } + }; + + template struct Reduce + { + __device__ __forceinline__ static void call(work_elem_type smem[3][BLOCK_SIZE], work_type& myVal) + { + typename Reductor::template rebind::other op; + blockReduce(smem_tuple(smem[0], smem[1], smem[2]), tie(myVal.x, myVal.y, myVal.z), threadIdx.x, make_tuple(op, op, op)); + } + }; + + template struct Reduce + { + __device__ __forceinline__ static void call(work_elem_type smem[4][BLOCK_SIZE], work_type& myVal) + { + typename Reductor::template rebind::other op; + blockReduce(smem_tuple(smem[0], smem[1], smem[2], smem[3]), tie(myVal.x, myVal.y, myVal.z, myVal.w), threadIdx.x, make_tuple(op, op, op, op)); + } + }; + template __global__ void reduceToColumn(const SrcPtr src, ResType* dst, const MaskPtr mask, const int cols) { typedef typename Reductor::work_type work_type; + typedef typename VecTraits::elem_type work_elem_type; + const int cn = VecTraits::cn; - __shared__ work_type smem[BLOCK_SIZE]; + __shared__ work_elem_type smem[cn][BLOCK_SIZE]; const int y = blockIdx.x; @@ -75,7 +115,7 @@ namespace grid_reduce_to_vec_detail } } - blockReduce(smem, myVal, threadIdx.x, op); + Reduce::call(smem, myVal); if (threadIdx.x == 0) dst[y] = saturate_cast(Reductor::result(myVal, cols)); diff --git a/modules/cudev/include/opencv2/cudev/grid/reduce_to_vec.hpp b/modules/cudev/include/opencv2/cudev/grid/reduce_to_vec.hpp index f9e351242..361d40d1c 100644 --- a/modules/cudev/include/opencv2/cudev/grid/reduce_to_vec.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/reduce_to_vec.hpp @@ -49,6 +49,7 @@ #include "../common.hpp" #include "../util/vec_traits.hpp" #include "../util/limits.hpp" +#include "../util/saturate_cast.hpp" #include "../ptr2d/traits.hpp" #include "../ptr2d/gpumat.hpp" #include "../ptr2d/mask.hpp" @@ -62,6 +63,11 @@ template struct Sum : plus { typedef T work_type; + template struct rebind + { + typedef Sum other; + }; + __device__ __forceinline__ static T initialValue() { return VecTraits::all(0); @@ -77,14 +83,19 @@ template struct Avg : plus { typedef T work_type; + template struct rebind + { + typedef Avg other; + }; + __device__ __forceinline__ static T initialValue() { return VecTraits::all(0); } - __device__ __forceinline__ static T result(T r, int sz) + __device__ __forceinline__ static T result(T r, float sz) { - return r / sz; + return saturate_cast(r / sz); } }; @@ -92,6 +103,11 @@ template struct Min : minimum { typedef T work_type; + template struct rebind + { + typedef Min other; + }; + __device__ __forceinline__ static T initialValue() { return VecTraits::all(numeric_limits::elem_type>::max()); @@ -107,6 +123,11 @@ template struct Max : maximum { typedef T work_type; + template struct rebind + { + typedef Max other; + }; + __device__ __forceinline__ static T initialValue() { return VecTraits::all(-numeric_limits::elem_type>::max()); @@ -158,7 +179,7 @@ __host__ void gridReduceToColumn_(const SrcPtr& src, GpuMat_& dst, cons CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); - createContinuous(rows, 1, DataType::type, dst); + dst.create(1, rows); grid_reduce_to_vec_detail::reduceToColumn(shrinkPtr(src), dst[0], @@ -173,7 +194,7 @@ __host__ void gridReduceToColumn_(const SrcPtr& src, GpuMat_& dst, Stre const int rows = getRows(src); const int cols = getCols(src); - createContinuous(rows, 1, DataType::type, dst); + dst.create(1, rows); grid_reduce_to_vec_detail::reduceToColumn(shrinkPtr(src), dst[0], diff --git a/modules/cudev/test/test_reduction.cu b/modules/cudev/test/test_reduction.cu index 03c78def1..c37605987 100644 --- a/modules/cudev/test/test_reduction.cu +++ b/modules/cudev/test/test_reduction.cu @@ -228,6 +228,9 @@ TEST(ReduceToColumn, Sum) Mat dst_gold; cv::reduce(src, dst_gold, 1, REDUCE_SUM, CV_32S); + dst_gold.cols = dst_gold.rows; + dst_gold.rows = 1; + dst_gold.step = dst_gold.cols * dst_gold.elemSize(); EXPECT_MAT_NEAR(dst_gold, dst, 0.0); } @@ -244,6 +247,9 @@ TEST(ReduceToColumn, Avg) Mat dst_gold; cv::reduce(src, dst_gold, 1, REDUCE_AVG, CV_32F); + dst_gold.cols = dst_gold.rows; + dst_gold.rows = 1; + dst_gold.step = dst_gold.cols * dst_gold.elemSize(); EXPECT_MAT_NEAR(dst_gold, dst, 1e-4); } @@ -260,6 +266,9 @@ TEST(ReduceToColumn, Min) Mat dst_gold; cv::reduce(src, dst_gold, 1, REDUCE_MIN); + dst_gold.cols = dst_gold.rows; + dst_gold.rows = 1; + dst_gold.step = dst_gold.cols * dst_gold.elemSize(); EXPECT_MAT_NEAR(dst_gold, dst, 0.0); } @@ -276,6 +285,9 @@ TEST(ReduceToColumn, Max) Mat dst_gold; cv::reduce(src, dst_gold, 1, REDUCE_MAX); + dst_gold.cols = dst_gold.rows; + dst_gold.rows = 1; + dst_gold.step = dst_gold.cols * dst_gold.elemSize(); EXPECT_MAT_NEAR(dst_gold, dst, 0.0); }