used new device layer for cv::gpu::reduce
This commit is contained in:
parent
31a7814395
commit
1ef211b889
@ -373,7 +373,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
|
||||
const cv::cuda::GpuMat d_src(src);
|
||||
cv::cuda::GpuMat dst;
|
||||
|
||||
TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp);
|
||||
TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp, CV_32F);
|
||||
|
||||
CUDA_SANITY_CHECK(dst);
|
||||
}
|
||||
@ -381,7 +381,7 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
|
||||
{
|
||||
cv::Mat dst;
|
||||
|
||||
TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp);
|
||||
TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp, CV_32F);
|
||||
|
||||
CPU_SANITY_CHECK(dst);
|
||||
}
|
||||
|
@ -40,301 +40,258 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
#include "opencv2/opencv_modules.hpp"
|
||||
|
||||
#include "opencv2/core/cuda/common.hpp"
|
||||
#include "opencv2/core/cuda/saturate_cast.hpp"
|
||||
#include "opencv2/core/cuda/vec_traits.hpp"
|
||||
#include "opencv2/core/cuda/vec_math.hpp"
|
||||
#include "opencv2/core/cuda/functional.hpp"
|
||||
#include "opencv2/core/cuda/reduce.hpp"
|
||||
#include "opencv2/core/cuda/limits.hpp"
|
||||
#ifndef HAVE_OPENCV_CUDEV
|
||||
|
||||
#include "unroll_detail.hpp"
|
||||
#error "opencv_cudev is required"
|
||||
|
||||
using namespace cv::cuda;
|
||||
using namespace cv::cuda::device;
|
||||
#else
|
||||
|
||||
namespace reduce
|
||||
#include "opencv2/cudaarithm.hpp"
|
||||
#include "opencv2/cudev.hpp"
|
||||
|
||||
using namespace cv::cudev;
|
||||
|
||||
namespace
|
||||
{
|
||||
struct Sum
|
||||
template <typename T, typename S, typename D>
|
||||
void reduceToRowImpl(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream)
|
||||
{
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T startValue() const
|
||||
{
|
||||
return VecTraits<T>::all(0);
|
||||
}
|
||||
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
|
||||
GpuMat_<D>& dst = (GpuMat_<D>&) _dst;
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T operator ()(T a, T b) const
|
||||
switch (reduceOp)
|
||||
{
|
||||
return a + b;
|
||||
}
|
||||
case cv::REDUCE_SUM:
|
||||
gridReduceToRow< Sum<S> >(src, dst, stream);
|
||||
break;
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T result(T r, int) const
|
||||
{
|
||||
return r;
|
||||
}
|
||||
case cv::REDUCE_AVG:
|
||||
gridReduceToRow< Avg<S> >(src, dst, stream);
|
||||
break;
|
||||
|
||||
__host__ __device__ __forceinline__ Sum() {}
|
||||
__host__ __device__ __forceinline__ Sum(const Sum&) {}
|
||||
case cv::REDUCE_MIN:
|
||||
gridReduceToRow< Min<S> >(src, dst, stream);
|
||||
break;
|
||||
|
||||
case cv::REDUCE_MAX:
|
||||
gridReduceToRow< Max<S> >(src, dst, stream);
|
||||
break;
|
||||
};
|
||||
|
||||
template <typename T> struct OutputType
|
||||
{
|
||||
typedef float type;
|
||||
};
|
||||
template <> struct OutputType<double>
|
||||
{
|
||||
typedef double type;
|
||||
};
|
||||
|
||||
struct Avg
|
||||
{
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T startValue() const
|
||||
{
|
||||
return VecTraits<T>::all(0);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T operator ()(T a, T b) const
|
||||
{
|
||||
return a + b;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ typename TypeVec<typename OutputType<typename VecTraits<T>::elem_type>::type, VecTraits<T>::cn>::vec_type result(T r, float sz) const
|
||||
{
|
||||
return r / sz;
|
||||
}
|
||||
|
||||
__host__ __device__ __forceinline__ Avg() {}
|
||||
__host__ __device__ __forceinline__ Avg(const Avg&) {}
|
||||
};
|
||||
|
||||
struct Min
|
||||
{
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T startValue() const
|
||||
{
|
||||
return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T operator ()(T a, T b) const
|
||||
{
|
||||
minimum<T> minOp;
|
||||
return minOp(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T result(T r, int) const
|
||||
{
|
||||
return r;
|
||||
}
|
||||
|
||||
__host__ __device__ __forceinline__ Min() {}
|
||||
__host__ __device__ __forceinline__ Min(const Min&) {}
|
||||
};
|
||||
|
||||
struct Max
|
||||
{
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T startValue() const
|
||||
{
|
||||
return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T operator ()(T a, T b) const
|
||||
{
|
||||
maximum<T> maxOp;
|
||||
return maxOp(a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T result(T r, int) const
|
||||
{
|
||||
return r;
|
||||
}
|
||||
|
||||
__host__ __device__ __forceinline__ Max() {}
|
||||
__host__ __device__ __forceinline__ Max(const Max&) {}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T, typename S, typename D, class Op>
|
||||
__global__ void rowsKernel(const PtrStepSz<T> src, D* dst, const Op op)
|
||||
{
|
||||
__shared__ S smem[16 * 16];
|
||||
|
||||
const int x = blockIdx.x * 16 + threadIdx.x;
|
||||
|
||||
S myVal = op.template startValue<S>();
|
||||
|
||||
if (x < src.cols)
|
||||
{
|
||||
for (int y = threadIdx.y; y < src.rows; y += 16)
|
||||
{
|
||||
S srcVal = src(y, x);
|
||||
myVal = op(myVal, srcVal);
|
||||
}
|
||||
}
|
||||
|
||||
smem[threadIdx.x * 16 + threadIdx.y] = myVal;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
volatile S* srow = smem + threadIdx.y * 16;
|
||||
|
||||
myVal = srow[threadIdx.x];
|
||||
device::reduce<16>(srow, myVal, threadIdx.x, op);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
srow[0] = myVal;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.y == 0 && x < src.cols)
|
||||
dst[x] = (D) op.result(smem[threadIdx.x * 16], src.rows);
|
||||
}
|
||||
|
||||
template <typename T, typename S, typename D, class Op>
|
||||
void rowsCaller(PtrStepSz<T> src, D* dst, cudaStream_t stream)
|
||||
{
|
||||
const dim3 block(16, 16);
|
||||
const dim3 grid(divUp(src.cols, block.x));
|
||||
|
||||
Op op;
|
||||
rowsKernel<T, S, D, Op><<<grid, block, 0, stream>>>(src, dst, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T, typename S, typename D>
|
||||
void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream)
|
||||
void reduceToColumnImpl_(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream)
|
||||
{
|
||||
typedef void (*func_t)(PtrStepSz<T> src, D* dst, cudaStream_t stream);
|
||||
static const func_t funcs[] =
|
||||
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
|
||||
GpuMat_<D>& dst = (GpuMat_<D>&) _dst;
|
||||
|
||||
switch (reduceOp)
|
||||
{
|
||||
rowsCaller<T, S, D, Sum>,
|
||||
rowsCaller<T, S, D, Avg>,
|
||||
rowsCaller<T, S, D, Max>,
|
||||
rowsCaller<T, S, D, Min>
|
||||
case cv::REDUCE_SUM:
|
||||
gridReduceToColumn< Sum<S> >(src, dst, stream);
|
||||
break;
|
||||
|
||||
case cv::REDUCE_AVG:
|
||||
gridReduceToColumn< Avg<S> >(src, dst, stream);
|
||||
break;
|
||||
|
||||
case cv::REDUCE_MIN:
|
||||
gridReduceToColumn< Min<S> >(src, dst, stream);
|
||||
break;
|
||||
|
||||
case cv::REDUCE_MAX:
|
||||
gridReduceToColumn< Max<S> >(src, dst, stream);
|
||||
break;
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T, typename S, typename D>
|
||||
void reduceToColumnImpl(const GpuMat& src, GpuMat& dst, int reduceOp, Stream& stream)
|
||||
{
|
||||
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int reduceOp, Stream& stream);
|
||||
static const func_t funcs[4] =
|
||||
{
|
||||
reduceToColumnImpl_<T, S, D>,
|
||||
reduceToColumnImpl_<typename MakeVec<T, 2>::type, typename MakeVec<S, 2>::type, typename MakeVec<D, 2>::type>,
|
||||
reduceToColumnImpl_<typename MakeVec<T, 3>::type, typename MakeVec<S, 3>::type, typename MakeVec<D, 3>::type>,
|
||||
reduceToColumnImpl_<typename MakeVec<T, 4>::type, typename MakeVec<S, 4>::type, typename MakeVec<D, 4>::type>
|
||||
};
|
||||
|
||||
funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
|
||||
funcs[src.channels() - 1](src, dst, reduceOp, stream);
|
||||
}
|
||||
}
|
||||
|
||||
template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
template void rows<unsigned char, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
|
||||
template void rows<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
template void rows<unsigned short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
template void rows<unsigned short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
template void rows<unsigned short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
|
||||
template void rows<short, int, short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
template void rows<short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
template void rows<short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
template void rows<short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
|
||||
template void rows<int, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
template void rows<int, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
template void rows<int, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
|
||||
template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
|
||||
template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
|
||||
template <int BLOCK_SIZE, typename T, typename S, typename D, int cn, class Op>
|
||||
__global__ void colsKernel(const PtrStepSz<typename TypeVec<T, cn>::vec_type> src, typename TypeVec<D, cn>::vec_type* dst, const Op op)
|
||||
void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream)
|
||||
{
|
||||
typedef typename TypeVec<T, cn>::vec_type src_type;
|
||||
typedef typename TypeVec<S, cn>::vec_type work_type;
|
||||
typedef typename TypeVec<D, cn>::vec_type dst_type;
|
||||
GpuMat src = _src.getGpuMat();
|
||||
|
||||
__shared__ S smem[BLOCK_SIZE * cn];
|
||||
CV_Assert( src.channels() <= 4 );
|
||||
CV_Assert( dim == 0 || dim == 1 );
|
||||
CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );
|
||||
|
||||
const int y = blockIdx.x;
|
||||
if (dtype < 0)
|
||||
dtype = src.depth();
|
||||
|
||||
const src_type* srcRow = src.ptr(y);
|
||||
_dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
|
||||
GpuMat dst = _dst.getGpuMat();
|
||||
|
||||
work_type myVal = op.template startValue<work_type>();
|
||||
|
||||
for (int x = threadIdx.x; x < src.cols; x += BLOCK_SIZE)
|
||||
myVal = op(myVal, saturate_cast<work_type>(srcRow[x]));
|
||||
|
||||
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(myVal), threadIdx.x, detail::Unroll<cn>::op(op));
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
dst[y] = saturate_cast<dst_type>(op.result(myVal, src.cols));
|
||||
if (dim == 0)
|
||||
{
|
||||
typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream);
|
||||
static const func_t funcs[7][7] =
|
||||
{
|
||||
{
|
||||
reduceToRowImpl<uchar, int, uchar>,
|
||||
0 /*reduceToRowImpl<uchar, int, schar>*/,
|
||||
0 /*reduceToRowImpl<uchar, int, ushort>*/,
|
||||
0 /*reduceToRowImpl<uchar, int, short>*/,
|
||||
reduceToRowImpl<uchar, int, int>,
|
||||
reduceToRowImpl<uchar, float, float>,
|
||||
reduceToRowImpl<uchar, double, double>
|
||||
},
|
||||
{
|
||||
0 /*reduceToRowImpl<schar, int, uchar>*/,
|
||||
0 /*reduceToRowImpl<schar, int, schar>*/,
|
||||
0 /*reduceToRowImpl<schar, int, ushort>*/,
|
||||
0 /*reduceToRowImpl<schar, int, short>*/,
|
||||
0 /*reduceToRowImpl<schar, int, int>*/,
|
||||
0 /*reduceToRowImpl<schar, float, float>*/,
|
||||
0 /*reduceToRowImpl<schar, double, double>*/
|
||||
},
|
||||
{
|
||||
0 /*reduceToRowImpl<ushort, int, uchar>*/,
|
||||
0 /*reduceToRowImpl<ushort, int, schar>*/,
|
||||
reduceToRowImpl<ushort, int, ushort>,
|
||||
0 /*reduceToRowImpl<ushort, int, short>*/,
|
||||
reduceToRowImpl<ushort, int, int>,
|
||||
reduceToRowImpl<ushort, float, float>,
|
||||
reduceToRowImpl<ushort, double, double>
|
||||
},
|
||||
{
|
||||
0 /*reduceToRowImpl<short, int, uchar>*/,
|
||||
0 /*reduceToRowImpl<short, int, schar>*/,
|
||||
0 /*reduceToRowImpl<short, int, ushort>*/,
|
||||
reduceToRowImpl<short, int, short>,
|
||||
reduceToRowImpl<short, int, int>,
|
||||
reduceToRowImpl<short, float, float>,
|
||||
reduceToRowImpl<short, double, double>
|
||||
},
|
||||
{
|
||||
0 /*reduceToRowImpl<int, int, uchar>*/,
|
||||
0 /*reduceToRowImpl<int, int, schar>*/,
|
||||
0 /*reduceToRowImpl<int, int, ushort>*/,
|
||||
0 /*reduceToRowImpl<int, int, short>*/,
|
||||
reduceToRowImpl<int, int, int>,
|
||||
reduceToRowImpl<int, float, float>,
|
||||
reduceToRowImpl<int, double, double>
|
||||
},
|
||||
{
|
||||
0 /*reduceToRowImpl<float, float, uchar>*/,
|
||||
0 /*reduceToRowImpl<float, float, schar>*/,
|
||||
0 /*reduceToRowImpl<float, float, ushort>*/,
|
||||
0 /*reduceToRowImpl<float, float, short>*/,
|
||||
0 /*reduceToRowImpl<float, float, int>*/,
|
||||
reduceToRowImpl<float, float, float>,
|
||||
reduceToRowImpl<float, double, double>
|
||||
},
|
||||
{
|
||||
0 /*reduceToRowImpl<double, double, uchar>*/,
|
||||
0 /*reduceToRowImpl<double, double, schar>*/,
|
||||
0 /*reduceToRowImpl<double, double, ushort>*/,
|
||||
0 /*reduceToRowImpl<double, double, short>*/,
|
||||
0 /*reduceToRowImpl<double, double, int>*/,
|
||||
0 /*reduceToRowImpl<double, double, float>*/,
|
||||
reduceToRowImpl<double, double, double>
|
||||
}
|
||||
|
||||
template <typename T, typename S, typename D, int cn, class Op> void colsCaller(PtrStepSzb src, void* dst, cudaStream_t stream)
|
||||
{
|
||||
const int BLOCK_SIZE = 256;
|
||||
|
||||
const dim3 block(BLOCK_SIZE);
|
||||
const dim3 grid(src.rows);
|
||||
|
||||
Op op;
|
||||
colsKernel<BLOCK_SIZE, T, S, D, cn, Op><<<grid, block, 0, stream>>>((PtrStepSz<typename TypeVec<T, cn>::vec_type>) src, (typename TypeVec<D, cn>::vec_type*) dst, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
||||
}
|
||||
|
||||
template <typename T, typename S, typename D> void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*func_t)(PtrStepSzb src, void* dst, cudaStream_t stream);
|
||||
static const func_t funcs[5][4] =
|
||||
{
|
||||
{0,0,0,0},
|
||||
{colsCaller<T, S, D, 1, Sum>, colsCaller<T, S, D, 1, Avg>, colsCaller<T, S, D, 1, Max>, colsCaller<T, S, D, 1, Min>},
|
||||
{colsCaller<T, S, D, 2, Sum>, colsCaller<T, S, D, 2, Avg>, colsCaller<T, S, D, 2, Max>, colsCaller<T, S, D, 2, Min>},
|
||||
{colsCaller<T, S, D, 3, Sum>, colsCaller<T, S, D, 3, Avg>, colsCaller<T, S, D, 3, Max>, colsCaller<T, S, D, 3, Min>},
|
||||
{colsCaller<T, S, D, 4, Sum>, colsCaller<T, S, D, 4, Avg>, colsCaller<T, S, D, 4, Max>, colsCaller<T, S, D, 4, Min>},
|
||||
};
|
||||
|
||||
funcs[cn][op](src, dst, stream);
|
||||
const func_t func = funcs[src.depth()][dst.depth()];
|
||||
|
||||
if (!func)
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
|
||||
|
||||
GpuMat dst_cont = dst.reshape(1);
|
||||
func(src.reshape(1), dst_cont, reduceOp, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream);
|
||||
static const func_t funcs[7][7] =
|
||||
{
|
||||
{
|
||||
reduceToColumnImpl<uchar, int, uchar>,
|
||||
0 /*reduceToColumnImpl<uchar, int, schar>*/,
|
||||
0 /*reduceToColumnImpl<uchar, int, ushort>*/,
|
||||
0 /*reduceToColumnImpl<uchar, int, short>*/,
|
||||
reduceToColumnImpl<uchar, int, int>,
|
||||
reduceToColumnImpl<uchar, float, float>,
|
||||
reduceToColumnImpl<uchar, double, double>
|
||||
},
|
||||
{
|
||||
0 /*reduceToColumnImpl<schar, int, uchar>*/,
|
||||
0 /*reduceToColumnImpl<schar, int, schar>*/,
|
||||
0 /*reduceToColumnImpl<schar, int, ushort>*/,
|
||||
0 /*reduceToColumnImpl<schar, int, short>*/,
|
||||
0 /*reduceToColumnImpl<schar, int, int>*/,
|
||||
0 /*reduceToColumnImpl<schar, float, float>*/,
|
||||
0 /*reduceToColumnImpl<schar, double, double>*/
|
||||
},
|
||||
{
|
||||
0 /*reduceToColumnImpl<ushort, int, uchar>*/,
|
||||
0 /*reduceToColumnImpl<ushort, int, schar>*/,
|
||||
reduceToColumnImpl<ushort, int, ushort>,
|
||||
0 /*reduceToColumnImpl<ushort, int, short>*/,
|
||||
reduceToColumnImpl<ushort, int, int>,
|
||||
reduceToColumnImpl<ushort, float, float>,
|
||||
reduceToColumnImpl<ushort, double, double>
|
||||
},
|
||||
{
|
||||
0 /*reduceToColumnImpl<short, int, uchar>*/,
|
||||
0 /*reduceToColumnImpl<short, int, schar>*/,
|
||||
0 /*reduceToColumnImpl<short, int, ushort>*/,
|
||||
reduceToColumnImpl<short, int, short>,
|
||||
reduceToColumnImpl<short, int, int>,
|
||||
reduceToColumnImpl<short, float, float>,
|
||||
reduceToColumnImpl<short, double, double>
|
||||
},
|
||||
{
|
||||
0 /*reduceToColumnImpl<int, int, uchar>*/,
|
||||
0 /*reduceToColumnImpl<int, int, schar>*/,
|
||||
0 /*reduceToColumnImpl<int, int, ushort>*/,
|
||||
0 /*reduceToColumnImpl<int, int, short>*/,
|
||||
reduceToColumnImpl<int, int, int>,
|
||||
reduceToColumnImpl<int, float, float>,
|
||||
reduceToColumnImpl<int, double, double>
|
||||
},
|
||||
{
|
||||
0 /*reduceToColumnImpl<float, float, uchar>*/,
|
||||
0 /*reduceToColumnImpl<float, float, schar>*/,
|
||||
0 /*reduceToColumnImpl<float, float, ushort>*/,
|
||||
0 /*reduceToColumnImpl<float, float, short>*/,
|
||||
0 /*reduceToColumnImpl<float, float, int>*/,
|
||||
reduceToColumnImpl<float, float, float>,
|
||||
reduceToColumnImpl<float, double, double>
|
||||
},
|
||||
{
|
||||
0 /*reduceToColumnImpl<double, double, uchar>*/,
|
||||
0 /*reduceToColumnImpl<double, double, schar>*/,
|
||||
0 /*reduceToColumnImpl<double, double, ushort>*/,
|
||||
0 /*reduceToColumnImpl<double, double, short>*/,
|
||||
0 /*reduceToColumnImpl<double, double, int>*/,
|
||||
0 /*reduceToColumnImpl<double, double, float>*/,
|
||||
reduceToColumnImpl<double, double, double>
|
||||
}
|
||||
};
|
||||
|
||||
const func_t func = funcs[src.depth()][dst.depth()];
|
||||
|
||||
if (!func)
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
|
||||
|
||||
func(src, dst, reduceOp, stream);
|
||||
}
|
||||
}
|
||||
|
||||
template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
template void cols<unsigned char, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
|
||||
template void cols<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
template void cols<unsigned short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
template void cols<unsigned short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
template void cols<unsigned short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
|
||||
template void cols<short, int, short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
template void cols<short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
template void cols<short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
template void cols<short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
|
||||
template void cols<int, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
template void cols<int, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
template void cols<int, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
|
||||
template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
|
||||
template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
}
|
||||
|
||||
#endif /* CUDA_DISABLER */
|
||||
#endif
|
||||
|
@ -186,188 +186,6 @@ double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normT
|
||||
return retVal;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// reduce
|
||||
|
||||
namespace reduce
|
||||
{
|
||||
template <typename T, typename S, typename D>
|
||||
void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
|
||||
template <typename T, typename S, typename D>
|
||||
void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
}
|
||||
|
||||
void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream)
|
||||
{
|
||||
GpuMat src = _src.getGpuMat();
|
||||
|
||||
CV_Assert( src.channels() <= 4 );
|
||||
CV_Assert( dim == 0 || dim == 1 );
|
||||
CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );
|
||||
|
||||
if (dtype < 0)
|
||||
dtype = src.depth();
|
||||
|
||||
_dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
|
||||
GpuMat dst = _dst.getGpuMat();
|
||||
|
||||
if (dim == 0)
|
||||
{
|
||||
typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
|
||||
static const func_t funcs[7][7] =
|
||||
{
|
||||
{
|
||||
::reduce::rows<unsigned char, int, unsigned char>,
|
||||
0/*::reduce::rows<unsigned char, int, signed char>*/,
|
||||
0/*::reduce::rows<unsigned char, int, unsigned short>*/,
|
||||
0/*::reduce::rows<unsigned char, int, short>*/,
|
||||
::reduce::rows<unsigned char, int, int>,
|
||||
::reduce::rows<unsigned char, float, float>,
|
||||
::reduce::rows<unsigned char, double, double>
|
||||
},
|
||||
{
|
||||
0/*::reduce::rows<signed char, int, unsigned char>*/,
|
||||
0/*::reduce::rows<signed char, int, signed char>*/,
|
||||
0/*::reduce::rows<signed char, int, unsigned short>*/,
|
||||
0/*::reduce::rows<signed char, int, short>*/,
|
||||
0/*::reduce::rows<signed char, int, int>*/,
|
||||
0/*::reduce::rows<signed char, float, float>*/,
|
||||
0/*::reduce::rows<signed char, double, double>*/
|
||||
},
|
||||
{
|
||||
0/*::reduce::rows<unsigned short, int, unsigned char>*/,
|
||||
0/*::reduce::rows<unsigned short, int, signed char>*/,
|
||||
::reduce::rows<unsigned short, int, unsigned short>,
|
||||
0/*::reduce::rows<unsigned short, int, short>*/,
|
||||
::reduce::rows<unsigned short, int, int>,
|
||||
::reduce::rows<unsigned short, float, float>,
|
||||
::reduce::rows<unsigned short, double, double>
|
||||
},
|
||||
{
|
||||
0/*::reduce::rows<short, int, unsigned char>*/,
|
||||
0/*::reduce::rows<short, int, signed char>*/,
|
||||
0/*::reduce::rows<short, int, unsigned short>*/,
|
||||
::reduce::rows<short, int, short>,
|
||||
::reduce::rows<short, int, int>,
|
||||
::reduce::rows<short, float, float>,
|
||||
::reduce::rows<short, double, double>
|
||||
},
|
||||
{
|
||||
0/*::reduce::rows<int, int, unsigned char>*/,
|
||||
0/*::reduce::rows<int, int, signed char>*/,
|
||||
0/*::reduce::rows<int, int, unsigned short>*/,
|
||||
0/*::reduce::rows<int, int, short>*/,
|
||||
::reduce::rows<int, int, int>,
|
||||
::reduce::rows<int, float, float>,
|
||||
::reduce::rows<int, double, double>
|
||||
},
|
||||
{
|
||||
0/*::reduce::rows<float, float, unsigned char>*/,
|
||||
0/*::reduce::rows<float, float, signed char>*/,
|
||||
0/*::reduce::rows<float, float, unsigned short>*/,
|
||||
0/*::reduce::rows<float, float, short>*/,
|
||||
0/*::reduce::rows<float, float, int>*/,
|
||||
::reduce::rows<float, float, float>,
|
||||
::reduce::rows<float, double, double>
|
||||
},
|
||||
{
|
||||
0/*::reduce::rows<double, double, unsigned char>*/,
|
||||
0/*::reduce::rows<double, double, signed char>*/,
|
||||
0/*::reduce::rows<double, double, unsigned short>*/,
|
||||
0/*::reduce::rows<double, double, short>*/,
|
||||
0/*::reduce::rows<double, double, int>*/,
|
||||
0/*::reduce::rows<double, double, float>*/,
|
||||
::reduce::rows<double, double, double>
|
||||
}
|
||||
};
|
||||
|
||||
const func_t func = funcs[src.depth()][dst.depth()];
|
||||
|
||||
if (!func)
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
|
||||
|
||||
func(src.reshape(1), dst.data, reduceOp, StreamAccessor::getStream(stream));
|
||||
}
|
||||
else
|
||||
{
|
||||
typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
|
||||
static const func_t funcs[7][7] =
|
||||
{
|
||||
{
|
||||
::reduce::cols<unsigned char, int, unsigned char>,
|
||||
0/*::reduce::cols<unsigned char, int, signed char>*/,
|
||||
0/*::reduce::cols<unsigned char, int, unsigned short>*/,
|
||||
0/*::reduce::cols<unsigned char, int, short>*/,
|
||||
::reduce::cols<unsigned char, int, int>,
|
||||
::reduce::cols<unsigned char, float, float>,
|
||||
::reduce::cols<unsigned char, double, double>
|
||||
},
|
||||
{
|
||||
0/*::reduce::cols<signed char, int, unsigned char>*/,
|
||||
0/*::reduce::cols<signed char, int, signed char>*/,
|
||||
0/*::reduce::cols<signed char, int, unsigned short>*/,
|
||||
0/*::reduce::cols<signed char, int, short>*/,
|
||||
0/*::reduce::cols<signed char, int, int>*/,
|
||||
0/*::reduce::cols<signed char, float, float>*/,
|
||||
0/*::reduce::cols<signed char, double, double>*/
|
||||
},
|
||||
{
|
||||
0/*::reduce::cols<unsigned short, int, unsigned char>*/,
|
||||
0/*::reduce::cols<unsigned short, int, signed char>*/,
|
||||
::reduce::cols<unsigned short, int, unsigned short>,
|
||||
0/*::reduce::cols<unsigned short, int, short>*/,
|
||||
::reduce::cols<unsigned short, int, int>,
|
||||
::reduce::cols<unsigned short, float, float>,
|
||||
::reduce::cols<unsigned short, double, double>
|
||||
},
|
||||
{
|
||||
0/*::reduce::cols<short, int, unsigned char>*/,
|
||||
0/*::reduce::cols<short, int, signed char>*/,
|
||||
0/*::reduce::cols<short, int, unsigned short>*/,
|
||||
::reduce::cols<short, int, short>,
|
||||
::reduce::cols<short, int, int>,
|
||||
::reduce::cols<short, float, float>,
|
||||
::reduce::cols<short, double, double>
|
||||
},
|
||||
{
|
||||
0/*::reduce::cols<int, int, unsigned char>*/,
|
||||
0/*::reduce::cols<int, int, signed char>*/,
|
||||
0/*::reduce::cols<int, int, unsigned short>*/,
|
||||
0/*::reduce::cols<int, int, short>*/,
|
||||
::reduce::cols<int, int, int>,
|
||||
::reduce::cols<int, float, float>,
|
||||
::reduce::cols<int, double, double>
|
||||
},
|
||||
{
|
||||
0/*::reduce::cols<float, float, unsigned char>*/,
|
||||
0/*::reduce::cols<float, float, signed char>*/,
|
||||
0/*::reduce::cols<float, float, unsigned short>*/,
|
||||
0/*::reduce::cols<float, float, short>*/,
|
||||
0/*::reduce::cols<float, float, int>*/,
|
||||
::reduce::cols<float, float, float>,
|
||||
::reduce::cols<float, double, double>
|
||||
},
|
||||
{
|
||||
0/*::reduce::cols<double, double, unsigned char>*/,
|
||||
0/*::reduce::cols<double, double, signed char>*/,
|
||||
0/*::reduce::cols<double, double, unsigned short>*/,
|
||||
0/*::reduce::cols<double, double, short>*/,
|
||||
0/*::reduce::cols<double, double, int>*/,
|
||||
0/*::reduce::cols<double, double, float>*/,
|
||||
::reduce::cols<double, double, double>
|
||||
}
|
||||
};
|
||||
|
||||
const func_t func = funcs[src.depth()][dst.depth()];
|
||||
|
||||
if (!func)
|
||||
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
|
||||
|
||||
func(src, dst.data, src.channels(), reduceOp, StreamAccessor::getStream(stream));
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// meanStdDev
|
||||
|
||||
|
@ -54,12 +54,52 @@ namespace cv { namespace cudev {
|
||||
|
||||
namespace grid_reduce_to_vec_detail
|
||||
{
|
||||
template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor, int cn> struct Reduce;
|
||||
|
||||
template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 1>
|
||||
{
|
||||
__device__ __forceinline__ static void call(work_elem_type smem[1][BLOCK_SIZE], work_type& myVal)
|
||||
{
|
||||
typename Reductor::template rebind<work_elem_type>::other op;
|
||||
blockReduce<BLOCK_SIZE>(smem[0], myVal, threadIdx.x, op);
|
||||
}
|
||||
};
|
||||
|
||||
template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 2>
|
||||
{
|
||||
__device__ __forceinline__ static void call(work_elem_type smem[2][BLOCK_SIZE], work_type& myVal)
|
||||
{
|
||||
typename Reductor::template rebind<work_elem_type>::other op;
|
||||
blockReduce<BLOCK_SIZE>(smem_tuple(smem[0], smem[1]), tie(myVal.x, myVal.y), threadIdx.x, make_tuple(op, op));
|
||||
}
|
||||
};
|
||||
|
||||
template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 3>
|
||||
{
|
||||
__device__ __forceinline__ static void call(work_elem_type smem[3][BLOCK_SIZE], work_type& myVal)
|
||||
{
|
||||
typename Reductor::template rebind<work_elem_type>::other op;
|
||||
blockReduce<BLOCK_SIZE>(smem_tuple(smem[0], smem[1], smem[2]), tie(myVal.x, myVal.y, myVal.z), threadIdx.x, make_tuple(op, op, op));
|
||||
}
|
||||
};
|
||||
|
||||
template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 4>
|
||||
{
|
||||
__device__ __forceinline__ static void call(work_elem_type smem[4][BLOCK_SIZE], work_type& myVal)
|
||||
{
|
||||
typename Reductor::template rebind<work_elem_type>::other op;
|
||||
blockReduce<BLOCK_SIZE>(smem_tuple(smem[0], smem[1], smem[2], smem[3]), tie(myVal.x, myVal.y, myVal.z, myVal.w), threadIdx.x, make_tuple(op, op, op, op));
|
||||
}
|
||||
};
|
||||
|
||||
template <class Reductor, int BLOCK_SIZE, class SrcPtr, typename ResType, class MaskPtr>
|
||||
__global__ void reduceToColumn(const SrcPtr src, ResType* dst, const MaskPtr mask, const int cols)
|
||||
{
|
||||
typedef typename Reductor::work_type work_type;
|
||||
typedef typename VecTraits<work_type>::elem_type work_elem_type;
|
||||
const int cn = VecTraits<work_type>::cn;
|
||||
|
||||
__shared__ work_type smem[BLOCK_SIZE];
|
||||
__shared__ work_elem_type smem[cn][BLOCK_SIZE];
|
||||
|
||||
const int y = blockIdx.x;
|
||||
|
||||
@ -75,7 +115,7 @@ namespace grid_reduce_to_vec_detail
|
||||
}
|
||||
}
|
||||
|
||||
blockReduce<BLOCK_SIZE>(smem, myVal, threadIdx.x, op);
|
||||
Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, cn>::call(smem, myVal);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
dst[y] = saturate_cast<ResType>(Reductor::result(myVal, cols));
|
||||
|
@ -49,6 +49,7 @@
|
||||
#include "../common.hpp"
|
||||
#include "../util/vec_traits.hpp"
|
||||
#include "../util/limits.hpp"
|
||||
#include "../util/saturate_cast.hpp"
|
||||
#include "../ptr2d/traits.hpp"
|
||||
#include "../ptr2d/gpumat.hpp"
|
||||
#include "../ptr2d/mask.hpp"
|
||||
@ -62,6 +63,11 @@ template <typename T> struct Sum : plus<T>
|
||||
{
|
||||
typedef T work_type;
|
||||
|
||||
template <typename U> struct rebind
|
||||
{
|
||||
typedef Sum<U> other;
|
||||
};
|
||||
|
||||
__device__ __forceinline__ static T initialValue()
|
||||
{
|
||||
return VecTraits<T>::all(0);
|
||||
@ -77,14 +83,19 @@ template <typename T> struct Avg : plus<T>
|
||||
{
|
||||
typedef T work_type;
|
||||
|
||||
template <typename U> struct rebind
|
||||
{
|
||||
typedef Avg<U> other;
|
||||
};
|
||||
|
||||
__device__ __forceinline__ static T initialValue()
|
||||
{
|
||||
return VecTraits<T>::all(0);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ static T result(T r, int sz)
|
||||
__device__ __forceinline__ static T result(T r, float sz)
|
||||
{
|
||||
return r / sz;
|
||||
return saturate_cast<T>(r / sz);
|
||||
}
|
||||
};
|
||||
|
||||
@ -92,6 +103,11 @@ template <typename T> struct Min : minimum<T>
|
||||
{
|
||||
typedef T work_type;
|
||||
|
||||
template <typename U> struct rebind
|
||||
{
|
||||
typedef Min<U> other;
|
||||
};
|
||||
|
||||
__device__ __forceinline__ static T initialValue()
|
||||
{
|
||||
return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
|
||||
@ -107,6 +123,11 @@ template <typename T> struct Max : maximum<T>
|
||||
{
|
||||
typedef T work_type;
|
||||
|
||||
template <typename U> struct rebind
|
||||
{
|
||||
typedef Max<U> other;
|
||||
};
|
||||
|
||||
__device__ __forceinline__ static T initialValue()
|
||||
{
|
||||
return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
|
||||
@ -158,7 +179,7 @@ __host__ void gridReduceToColumn_(const SrcPtr& src, GpuMat_<ResType>& dst, cons
|
||||
|
||||
CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
|
||||
|
||||
createContinuous(rows, 1, DataType<ResType>::type, dst);
|
||||
dst.create(1, rows);
|
||||
|
||||
grid_reduce_to_vec_detail::reduceToColumn<Reductor, Policy>(shrinkPtr(src),
|
||||
dst[0],
|
||||
@ -173,7 +194,7 @@ __host__ void gridReduceToColumn_(const SrcPtr& src, GpuMat_<ResType>& dst, Stre
|
||||
const int rows = getRows(src);
|
||||
const int cols = getCols(src);
|
||||
|
||||
createContinuous(rows, 1, DataType<ResType>::type, dst);
|
||||
dst.create(1, rows);
|
||||
|
||||
grid_reduce_to_vec_detail::reduceToColumn<Reductor, Policy>(shrinkPtr(src),
|
||||
dst[0],
|
||||
|
@ -228,6 +228,9 @@ TEST(ReduceToColumn, Sum)
|
||||
|
||||
Mat dst_gold;
|
||||
cv::reduce(src, dst_gold, 1, REDUCE_SUM, CV_32S);
|
||||
dst_gold.cols = dst_gold.rows;
|
||||
dst_gold.rows = 1;
|
||||
dst_gold.step = dst_gold.cols * dst_gold.elemSize();
|
||||
|
||||
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
|
||||
}
|
||||
@ -244,6 +247,9 @@ TEST(ReduceToColumn, Avg)
|
||||
|
||||
Mat dst_gold;
|
||||
cv::reduce(src, dst_gold, 1, REDUCE_AVG, CV_32F);
|
||||
dst_gold.cols = dst_gold.rows;
|
||||
dst_gold.rows = 1;
|
||||
dst_gold.step = dst_gold.cols * dst_gold.elemSize();
|
||||
|
||||
EXPECT_MAT_NEAR(dst_gold, dst, 1e-4);
|
||||
}
|
||||
@ -260,6 +266,9 @@ TEST(ReduceToColumn, Min)
|
||||
|
||||
Mat dst_gold;
|
||||
cv::reduce(src, dst_gold, 1, REDUCE_MIN);
|
||||
dst_gold.cols = dst_gold.rows;
|
||||
dst_gold.rows = 1;
|
||||
dst_gold.step = dst_gold.cols * dst_gold.elemSize();
|
||||
|
||||
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
|
||||
}
|
||||
@ -276,6 +285,9 @@ TEST(ReduceToColumn, Max)
|
||||
|
||||
Mat dst_gold;
|
||||
cv::reduce(src, dst_gold, 1, REDUCE_MAX);
|
||||
dst_gold.cols = dst_gold.rows;
|
||||
dst_gold.rows = 1;
|
||||
dst_gold.step = dst_gold.cols * dst_gold.elemSize();
|
||||
|
||||
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user