implemented gpu::reduce
This commit is contained in:
@@ -1804,4 +1804,278 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
template void sqrSumCaller<short>(const DevMem2D, PtrStep, double*, int);
|
||||
template void sqrSumCaller<int>(const DevMem2D, PtrStep, double*, int);
|
||||
template void sqrSumCaller<float>(const DevMem2D, PtrStep, double*, int);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// reduce
|
||||
|
||||
template <typename S> struct SumReductor
|
||||
{
|
||||
__device__ __forceinline__ S startValue() const
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ S operator ()(volatile S a, volatile S b) const
|
||||
{
|
||||
return a + b;
|
||||
}
|
||||
|
||||
__device__ __forceinline S result(S r, double) const
|
||||
{
|
||||
return r;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename S> struct AvgReductor
|
||||
{
|
||||
__device__ __forceinline__ S startValue() const
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ S operator ()(volatile S a, volatile S b) const
|
||||
{
|
||||
return a + b;
|
||||
}
|
||||
|
||||
__device__ __forceinline double result(S r, double sz) const
|
||||
{
|
||||
return r / sz;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename S> struct MinReductor
|
||||
{
|
||||
__device__ __forceinline__ S startValue() const
|
||||
{
|
||||
return numeric_limits<S>::max();
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ T operator ()(volatile T a, volatile T b) const
|
||||
{
|
||||
return saturate_cast<T>(::min(a, b));
|
||||
}
|
||||
__device__ __forceinline__ float operator ()(volatile float a, volatile float b) const
|
||||
{
|
||||
return ::fmin(a, b);
|
||||
}
|
||||
|
||||
__device__ __forceinline S result(S r, double) const
|
||||
{
|
||||
return r;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename S> struct MaxReductor
|
||||
{
|
||||
__device__ __forceinline__ S startValue() const
|
||||
{
|
||||
return numeric_limits<S>::min();
|
||||
}
|
||||
|
||||
template <typename T> __device__ __forceinline__ int operator ()(volatile T a, volatile T b) const
|
||||
{
|
||||
return ::max(a, b);
|
||||
}
|
||||
__device__ __forceinline__ float operator ()(volatile float a, volatile float b) const
|
||||
{
|
||||
return ::fmax(a, b);
|
||||
}
|
||||
|
||||
__device__ __forceinline S result(S r, double) const
|
||||
{
|
||||
return r;
|
||||
}
|
||||
};
|
||||
|
||||
template <class Op, typename T, typename S, typename D> __global__ void reduceRows(const DevMem2D_<T> src, D* dst, const Op op)
|
||||
{
|
||||
__shared__ S smem[16 * 16];
|
||||
|
||||
const int x = blockIdx.x * 16 + threadIdx.x;
|
||||
|
||||
if (x < src.cols)
|
||||
{
|
||||
S myVal = op.startValue();
|
||||
|
||||
for (int y = threadIdx.y; y < src.rows; y += 16)
|
||||
myVal = op(myVal, src.ptr(y)[x]);
|
||||
|
||||
smem[threadIdx.y * 16 + threadIdx.x] = myVal;
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.y == 0)
|
||||
{
|
||||
myVal = smem[threadIdx.x];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 1; i < 16; ++i)
|
||||
myVal = op(myVal, smem[i * 16 + threadIdx.x]);
|
||||
|
||||
dst[x] = saturate_cast<D>(op.result(myVal, src.rows));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <template <typename> class Op, typename T, typename S, typename D> void reduceRows_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)
|
||||
{
|
||||
const dim3 block(16, 16);
|
||||
const dim3 grid(divUp(src.cols, block.x));
|
||||
|
||||
Op<S> op;
|
||||
reduceRows<Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
||||
}
|
||||
|
||||
template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[] =
|
||||
{
|
||||
reduceRows_caller<SumReductor, T, S, D>,
|
||||
reduceRows_caller<AvgReductor, T, S, D>,
|
||||
reduceRows_caller<MaxReductor, T, S, D>,
|
||||
reduceRows_caller<MinReductor, T, S, D>
|
||||
};
|
||||
|
||||
callers[reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);
|
||||
}
|
||||
|
||||
template void reduceRows_gpu<uchar, int, uchar>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
template void reduceRows_gpu<uchar, int, int>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
template void reduceRows_gpu<uchar, int, float>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
|
||||
template void reduceRows_gpu<ushort, int, ushort>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
template void reduceRows_gpu<ushort, int, int>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
template void reduceRows_gpu<ushort, int, float>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
|
||||
template void reduceRows_gpu<short, int, short>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
template void reduceRows_gpu<short, int, int>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
template void reduceRows_gpu<short, int, float>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
|
||||
template void reduceRows_gpu<int, int, int>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
template void reduceRows_gpu<int, int, float>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
|
||||
template void reduceRows_gpu<float, float, float>(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
|
||||
|
||||
|
||||
|
||||
template <int cn, class Op, typename T, typename S, typename D> __global__ void reduceCols(const DevMem2D_<T> src, D* dst, const Op op)
|
||||
{
|
||||
__shared__ S smem[256 * cn];
|
||||
|
||||
const int y = blockIdx.x;
|
||||
|
||||
const T* src_row = src.ptr(y);
|
||||
|
||||
S myVal[cn];
|
||||
|
||||
#pragma unroll
|
||||
for (int c = 0; c < cn; ++c)
|
||||
myVal[c] = op.startValue();
|
||||
|
||||
for (int x = threadIdx.x; x < src.cols; x += 256)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int c = 0; c < cn; ++c)
|
||||
myVal[c] = op(myVal[c], src_row[x * cn + c]);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int c = 0; c < cn; ++c)
|
||||
smem[c * 256 + threadIdx.x] = myVal[c];
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < 128)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int c = 0; c < cn; ++c)
|
||||
smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 128]);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < 64)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int c = 0; c < cn; ++c)
|
||||
smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 64]);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
volatile S* sdata = smem;
|
||||
|
||||
if (threadIdx.x < 32)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int c = 0; c < cn; ++c)
|
||||
{
|
||||
sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 32]);
|
||||
sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 16]);
|
||||
sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 8]);
|
||||
sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 4]);
|
||||
sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 2]);
|
||||
sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 1]);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int c = 0; c < cn; ++c)
|
||||
dst[y * cn + c] = saturate_cast<D>(op.result(smem[c * 256], src.cols));
|
||||
}
|
||||
}
|
||||
|
||||
template <int cn, template <typename> class Op, typename T, typename S, typename D> void reduceCols_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)
|
||||
{
|
||||
const dim3 block(256);
|
||||
const dim3 grid(src.rows);
|
||||
|
||||
Op<S> op;
|
||||
reduceCols<cn, Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
|
||||
}
|
||||
|
||||
template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[4][4] =
|
||||
{
|
||||
{reduceCols_caller<1, SumReductor, T, S, D>, reduceCols_caller<1, AvgReductor, T, S, D>, reduceCols_caller<1, MaxReductor, T, S, D>, reduceCols_caller<1, MinReductor, T, S, D>},
|
||||
{reduceCols_caller<2, SumReductor, T, S, D>, reduceCols_caller<2, AvgReductor, T, S, D>, reduceCols_caller<2, MaxReductor, T, S, D>, reduceCols_caller<2, MinReductor, T, S, D>},
|
||||
{reduceCols_caller<3, SumReductor, T, S, D>, reduceCols_caller<3, AvgReductor, T, S, D>, reduceCols_caller<3, MaxReductor, T, S, D>, reduceCols_caller<3, MinReductor, T, S, D>},
|
||||
{reduceCols_caller<4, SumReductor, T, S, D>, reduceCols_caller<4, AvgReductor, T, S, D>, reduceCols_caller<4, MaxReductor, T, S, D>, reduceCols_caller<4, MinReductor, T, S, D>},
|
||||
};
|
||||
|
||||
callers[cn - 1][reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);
|
||||
}
|
||||
|
||||
template void reduceCols_gpu<uchar, int, uchar>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
template void reduceCols_gpu<uchar, int, int>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
template void reduceCols_gpu<uchar, int, float>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
|
||||
template void reduceCols_gpu<ushort, int, ushort>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
template void reduceCols_gpu<ushort, int, int>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
template void reduceCols_gpu<ushort, int, float>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
|
||||
template void reduceCols_gpu<short, int, short>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
template void reduceCols_gpu<short, int, int>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
template void reduceCols_gpu<short, int, float>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
|
||||
template void reduceCols_gpu<int, int, int>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
template void reduceCols_gpu<int, int, float>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
|
||||
template void reduceCols_gpu<float, float, float>(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
}}}
|
@@ -63,6 +63,7 @@ void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const G
|
||||
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
|
||||
int cv::gpu::countNonZero(const GpuMat&) { throw_nogpu(); return 0; }
|
||||
int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_nogpu(); return 0; }
|
||||
void cv::gpu::reduce(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_nogpu(); }
|
||||
|
||||
#else
|
||||
|
||||
@@ -598,4 +599,150 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
|
||||
return caller(src, buf);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// reduce
|
||||
|
||||
namespace cv { namespace gpu { namespace mathfunc {
|
||||
template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
|
||||
{
|
||||
using namespace cv::gpu::mathfunc;
|
||||
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4 && dtype <= CV_32F);
|
||||
CV_Assert(dim == 0 || dim == 1);
|
||||
CV_Assert(reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN);
|
||||
|
||||
if (dtype < 0)
|
||||
dtype = src.depth();
|
||||
|
||||
dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKETYPE(dtype, src.channels()));
|
||||
|
||||
if (dim == 0)
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2D& src, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[6][6] =
|
||||
{
|
||||
{
|
||||
reduceRows_gpu<unsigned char, int, unsigned char>,
|
||||
0/*reduceRows_gpu<unsigned char, int, signed char>*/,
|
||||
0/*reduceRows_gpu<unsigned char, int, unsigned short>*/,
|
||||
0/*reduceRows_gpu<unsigned char, int, short>*/,
|
||||
reduceRows_gpu<unsigned char, int, int>,
|
||||
reduceRows_gpu<unsigned char, int, float>
|
||||
},
|
||||
{
|
||||
0/*reduceRows_gpu<signed char, int, unsigned char>*/,
|
||||
0/*reduceRows_gpu<signed char, int, signed char>*/,
|
||||
0/*reduceRows_gpu<signed char, int, unsigned short>*/,
|
||||
0/*reduceRows_gpu<signed char, int, short>*/,
|
||||
0/*reduceRows_gpu<signed char, int, int>*/,
|
||||
0/*reduceRows_gpu<signed char, int, float>*/
|
||||
},
|
||||
{
|
||||
0/*reduceRows_gpu<unsigned short, int, unsigned char>*/,
|
||||
0/*reduceRows_gpu<unsigned short, int, signed char>*/,
|
||||
reduceRows_gpu<unsigned short, int, unsigned short>,
|
||||
0/*reduceRows_gpu<unsigned short, int, short>*/,
|
||||
reduceRows_gpu<unsigned short, int, int>,
|
||||
reduceRows_gpu<unsigned short, int, float>
|
||||
},
|
||||
{
|
||||
0/*reduceRows_gpu<short, int, unsigned char>*/,
|
||||
0/*reduceRows_gpu<short, int, signed char>*/,
|
||||
0/*reduceRows_gpu<short, int, unsigned short>*/,
|
||||
reduceRows_gpu<short, int, short>,
|
||||
reduceRows_gpu<short, int, int>,
|
||||
reduceRows_gpu<short, int, float>
|
||||
},
|
||||
{
|
||||
0/*reduceRows_gpu<int, int, unsigned char>*/,
|
||||
0/*reduceRows_gpu<int, int, signed char>*/,
|
||||
0/*reduceRows_gpu<int, int, unsigned short>*/,
|
||||
0/*reduceRows_gpu<int, int, short>*/,
|
||||
reduceRows_gpu<int, int, int>,
|
||||
reduceRows_gpu<int, int, float>
|
||||
},
|
||||
{
|
||||
0/*reduceRows_gpu<float, float, unsigned char>*/,
|
||||
0/*reduceRows_gpu<float, float, signed char>*/,
|
||||
0/*reduceRows_gpu<float, float, unsigned short>*/,
|
||||
0/*reduceRows_gpu<float, float, short>*/,
|
||||
0/*reduceRows_gpu<float, float, int>*/,
|
||||
reduceRows_gpu<float, float, float>
|
||||
}
|
||||
};
|
||||
|
||||
const caller_t func = callers[src.depth()][dst.depth()];
|
||||
if (!func)
|
||||
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");
|
||||
|
||||
func(src.reshape(1), dst.reshape(1), reduceOp, StreamAccessor::getStream(stream));
|
||||
}
|
||||
else
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2D& src, int cn, const DevMem2D& dst, int reduceOp, cudaStream_t stream);
|
||||
|
||||
static const caller_t callers[6][6] =
|
||||
{
|
||||
{
|
||||
reduceCols_gpu<unsigned char, int, unsigned char>,
|
||||
0/*reduceCols_gpu<unsigned char, int, signed char>*/,
|
||||
0/*reduceCols_gpu<unsigned char, int, unsigned short>*/,
|
||||
0/*reduceCols_gpu<unsigned char, int, short>*/,
|
||||
reduceCols_gpu<unsigned char, int, int>,
|
||||
reduceCols_gpu<unsigned char, int, float>
|
||||
},
|
||||
{
|
||||
0/*reduceCols_gpu<signed char, int, unsigned char>*/,
|
||||
0/*reduceCols_gpu<signed char, int, signed char>*/,
|
||||
0/*reduceCols_gpu<signed char, int, unsigned short>*/,
|
||||
0/*reduceCols_gpu<signed char, int, short>*/,
|
||||
0/*reduceCols_gpu<signed char, int, int>*/,
|
||||
0/*reduceCols_gpu<signed char, int, float>*/
|
||||
},
|
||||
{
|
||||
0/*reduceCols_gpu<unsigned short, int, unsigned char>*/,
|
||||
0/*reduceCols_gpu<unsigned short, int, signed char>*/,
|
||||
reduceCols_gpu<unsigned short, int, unsigned short>,
|
||||
0/*reduceCols_gpu<unsigned short, int, short>*/,
|
||||
reduceCols_gpu<unsigned short, int, int>,
|
||||
reduceCols_gpu<unsigned short, int, float>
|
||||
},
|
||||
{
|
||||
0/*reduceCols_gpu<short, int, unsigned char>*/,
|
||||
0/*reduceCols_gpu<short, int, signed char>*/,
|
||||
0/*reduceCols_gpu<short, int, unsigned short>*/,
|
||||
reduceCols_gpu<short, int, short>,
|
||||
reduceCols_gpu<short, int, int>,
|
||||
reduceCols_gpu<short, int, float>
|
||||
},
|
||||
{
|
||||
0/*reduceCols_gpu<int, int, unsigned char>*/,
|
||||
0/*reduceCols_gpu<int, int, signed char>*/,
|
||||
0/*reduceCols_gpu<int, int, unsigned short>*/,
|
||||
0/*reduceCols_gpu<int, int, short>*/,
|
||||
reduceCols_gpu<int, int, int>,
|
||||
reduceCols_gpu<int, int, float>
|
||||
},
|
||||
{
|
||||
0/*reduceCols_gpu<float, unsigned char>*/,
|
||||
0/*reduceCols_gpu<float, signed char>*/,
|
||||
0/*reduceCols_gpu<float, unsigned short>*/,
|
||||
0/*reduceCols_gpu<float, short>*/,
|
||||
0/*reduceCols_gpu<float, int>*/,
|
||||
reduceCols_gpu<float, float, float>
|
||||
}
|
||||
};
|
||||
|
||||
const caller_t func = callers[src.depth()][dst.depth()];
|
||||
if (!func)
|
||||
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");
|
||||
|
||||
func(src, src.channels(), dst, reduceOp, StreamAccessor::getStream(stream));
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Reference in New Issue
Block a user