added performance tests for gpu::reduce
commit 67a9b79433
parent 66b41b67f9
@@ -716,3 +716,34 @@ PERF_TEST_P(DevInfo_Size_MatType, addWeighted, testing::Combine(testing::ValuesI
 
     SANITY_CHECK(dst_host);
 }
+
+PERF_TEST_P(DevInfo_Size_MatType_FlipCode, reduce, testing::Combine(testing::ValuesIn(devices()),
+                                                                    testing::Values(GPU_TYPICAL_MAT_SIZES),
+                                                                    testing::Values(CV_8UC1, CV_8UC4, CV_32FC1),
+                                                                    testing::Values((int)HORIZONTAL_AXIS, (int)VERTICAL_AXIS)))
+{
+    DeviceInfo devInfo = std::tr1::get<0>(GetParam());
+    Size size = std::tr1::get<1>(GetParam());
+    int type = std::tr1::get<2>(GetParam());
+    int dim = std::tr1::get<3>(GetParam());
+
+    setDevice(devInfo.deviceID());
+
+    Mat src_host(size, type);
+
+    declare.in(src_host, WARMUP_RNG);
+
+    GpuMat src(src_host);
+    GpuMat dst(size, type);
+
+    declare.time(0.5).iterations(100);
+
+    SIMPLE_TEST_CYCLE()
+    {
+        reduce(src, dst, dim, CV_REDUCE_MIN);
+    }
+
+    Mat dst_host = dst;
+
+    SANITY_CHECK(dst_host);
+}
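The new perf test sweeps every combination of device, typical matrix size, input type (CV_8UC1, CV_8UC4, CV_32FC1) and reduction axis, timing a min-reduction for 100 iterations after an RNG warm-up. A minimal standalone sketch of what one cell of that grid measures, assuming the 2.x-era headers and a CUDA-capable device (the names and the 1920x1080 size are illustrative, not from the commit):

#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <iostream>

int main()
{
    cv::Mat src_host(1080, 1920, CV_32FC1);
    cv::randu(src_host, 0, 255);                  // stand-in for WARMUP_RNG

    cv::gpu::GpuMat src(src_host), dst;

    cv::gpu::reduce(src, dst, 0, CV_REDUCE_MIN);  // warm-up, excluded from timing

    const int iters = 100;
    const int64 t0 = cv::getTickCount();
    for (int i = 0; i < iters; ++i)
        cv::gpu::reduce(src, dst, 0, CV_REDUCE_MIN);
    const double ms = (cv::getTickCount() - t0) * 1000.0
                    / cv::getTickFrequency() / iters;
    std::cout << "gpu::reduce (row-wise min): " << ms << " ms/iter" << std::endl;
    return 0;
}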
@@ -1894,27 +1894,29 @@ namespace cv { namespace gpu { namespace mathfunc
 
         const int x = blockIdx.x * 16 + threadIdx.x;
 
+        S myVal = op.startValue();
+
         if (x < src.cols)
         {
-            S myVal = op.startValue();
-
             for (int y = threadIdx.y; y < src.rows; y += 16)
                 myVal = op(myVal, src.ptr(y)[x]);
-
-            smem[threadIdx.y * 16 + threadIdx.x] = myVal;
-            __syncthreads();
-
-            if (threadIdx.y == 0)
-            {
-                myVal = smem[threadIdx.x];
-
-                #pragma unroll
-                for (int i = 1; i < 16; ++i)
-                    myVal = op(myVal, smem[i * 16 + threadIdx.x]);
-
-                dst[x] = saturate_cast<D>(op.result(myVal, src.rows));
-            }
         }
+
+        smem[threadIdx.x * 16 + threadIdx.y] = myVal;
+        __syncthreads();
+
+        if (threadIdx.x < 8)
+        {
+            volatile S* srow = smem + threadIdx.y * 16;
+            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 8]);
+            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 4]);
+            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 2]);
+            srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 1]);
+        }
+        __syncthreads();
+
+        if (threadIdx.y == 0 && x < src.cols)
+            dst[x] = saturate_cast<D>(op.result(smem[threadIdx.x * 16], src.rows));
     }
 
     template <template <typename> class Op, typename T, typename S, typename D> void reduceRows_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)
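The kernel rewrite above replaces the old sequential fold in row 0 with a transposed shared-memory layout plus a warp-synchronous tree reduction: each thread accumulates a per-column partial down the image, the partials for column x land in row threadIdx.x of smem, and eight threads then fold each 16-element row in 8/4/2/1 steps through a volatile pointer. A self-contained sketch of the same pattern, hardcoded to float and min (the kernel name, the fminf op, and the element-stride pitch are mine, not the library's; the volatile idiom assumes pre-Volta lockstep warps, as the original does):

#include <cfloat>

// Launch with dim3 block(16, 16) and (cols + 15) / 16 blocks in x.
// "pitch" is the row stride in elements, not bytes.
__global__ void colMin16x16(const float* src, int pitch, int rows, int cols, float* dst)
{
    __shared__ float smem[16 * 16];

    const int x = blockIdx.x * 16 + threadIdx.x;

    float myVal = FLT_MAX; // identity element of the min-reduction

    if (x < cols)
    {
        // Each thread folds every 16th row of its column.
        for (int y = threadIdx.y; y < rows; y += 16)
            myVal = fminf(myVal, src[y * pitch + x]);
    }

    // Transposed store: smem row threadIdx.x collects the 16 partials of column x.
    smem[threadIdx.x * 16 + threadIdx.y] = myVal;
    __syncthreads();

    // Warp-synchronous 8/4/2/1 tree fold of each 16-element smem row.
    if (threadIdx.x < 8)
    {
        volatile float* srow = smem + threadIdx.y * 16;
        srow[threadIdx.x] = fminf(srow[threadIdx.x], srow[threadIdx.x + 8]);
        srow[threadIdx.x] = fminf(srow[threadIdx.x], srow[threadIdx.x + 4]);
        srow[threadIdx.x] = fminf(srow[threadIdx.x], srow[threadIdx.x + 2]);
        srow[threadIdx.x] = fminf(srow[threadIdx.x], srow[threadIdx.x + 1]);
    }
    __syncthreads();

    // Row threadIdx.x of smem is now fully reduced at element 0.
    if (threadIdx.y == 0 && x < cols)
        dst[x] = smem[threadIdx.x * 16];
}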
@@ -1965,7 +1967,6 @@ namespace cv { namespace gpu { namespace mathfunc
 
 
 
-
    template <int cn, class Op, typename T, typename S, typename D> __global__ void reduceCols(const DevMem2D_<T> src, D* dst, const Op op)
    {
        __shared__ S smem[256 * cn];
@@ -1980,6 +1981,9 @@ namespace cv { namespace gpu { namespace mathfunc
         for (int c = 0; c < cn; ++c)
             myVal[c] = op.startValue();
 
+    #if __CUDA_ARCH__ >= 200
+
+        // For cc >= 2.0 prefer L1 cache
         for (int x = threadIdx.x; x < src.cols; x += 256)
         {
             #pragma unroll
@@ -1987,6 +1991,29 @@ namespace cv { namespace gpu { namespace mathfunc
                 myVal[c] = op(myVal[c], src_row[x * cn + c]);
         }
 
+    #else // __CUDA_ARCH__ >= 200
+
+        // For older arch use shared memory for cache
+        for (int x = 0; x < src.cols; x += 256)
+        {
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+            {
+                smem[c * 256 + threadIdx.x] = op.startValue();
+                const int load_x = x * cn + c * 256 + threadIdx.x;
+                if (load_x < src.cols * cn)
+                    smem[c * 256 + threadIdx.x] = src_row[load_x];
+            }
+            __syncthreads();
+
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+                myVal[c] = op(myVal[c], smem[threadIdx.x * cn + c]);
+            __syncthreads();
+        }
+
+    #endif // __CUDA_ARCH__ >= 200
+
         #pragma unroll
         for (int c = 0; c < cn; ++c)
             smem[c * 256 + threadIdx.x] = myVal[c];
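The two hunks above split the column-load loop on __CUDA_ARCH__, which is defined only while device code is compiled and once per target architecture, so a single kernel source can take an L1-cache-friendly path on compute capability >= 2.0 and an explicit shared-memory staging path on older parts. A minimal sketch of that dispatch technique (the demo name and the doubling workload are illustrative; the #else branch assumes 256-thread blocks):

// Both branches come from one kernel source; nvcc compiles it once per
// target architecture and keeps only the matching branch in each binary.
__global__ void archDispatchDemo(const float* in, float* out, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

#if __CUDA_ARCH__ >= 200
    // cc >= 2.0: strided global reads go through L1, no manual staging needed.
    if (i < n)
        out[i] = in[i] * 2.0f;
#else
    // cc < 2.0: stage through shared memory instead.
    __shared__ float buf[256];
    if (i < n)
        buf[threadIdx.x] = in[i];
    __syncthreads();
    if (i < n)
        out[i] = buf[threadIdx.x] * 2.0f;
#endif
}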
@@ -2025,12 +2052,8 @@ namespace cv { namespace gpu { namespace mathfunc
         }
         __syncthreads();
 
-        if (threadIdx.x == 0)
-        {
-            #pragma unroll
-            for (int c = 0; c < cn; ++c)
-                dst[y * cn + c] = saturate_cast<D>(op.result(smem[c * 256], src.cols));
-        }
+        if (threadIdx.x < cn)
+            dst[y * cn + threadIdx.x] = saturate_cast<D>(op.result(smem[threadIdx.x * 256], src.cols));
     }
 
     template <int cn, template <typename> class Op, typename T, typename S, typename D> void reduceCols_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)
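This last kernel hunk also changes who writes the per-channel results: previously thread 0 looped over all cn channels serially; now the first cn threads each store one channel, so for interleaved layouts the final stores happen in parallel and stay coalesced. A hedged fragment of the pattern, with names of my own:

// Illustrative device helper (not library code): the first cn threads each
// emit one channel of output row y, instead of thread 0 looping over all cn.
__device__ void writeRowResult(float* dst, const float* results, int y, int cn)
{
    if ((int)threadIdx.x < cn)
        dst[y * cn + threadIdx.x] = results[threadIdx.x];
}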
@@ -1419,3 +1419,39 @@ TEST(Canny)
     gpu::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
     GPU_OFF;
 }
+
+
+TEST(reduce)
+{
+    for (int size = 1000; size < 4000; size += 1000)
+    {
+        Mat src;
+        gen(src, size, size, CV_32F, 0, 255);
+
+        Mat dst0(1, src.cols, CV_32F);
+        Mat dst1(src.rows, 1, CV_32F);
+
+        gpu::GpuMat d_src(src);
+        gpu::GpuMat d_dst0(1, src.cols, CV_32F);
+        gpu::GpuMat d_dst1(1, src.rows, CV_32F);
+
+        SUBTEST << "size " << size << ", dim = 0";
+
+        CPU_ON;
+        reduce(src, dst0, 0, CV_REDUCE_MIN);
+        CPU_OFF;
+
+        GPU_ON;
+        gpu::reduce(d_src, d_dst0, 0, CV_REDUCE_MIN);
+        GPU_OFF;
+
+        SUBTEST << "size " << size << ", dim = 1";
+
+        CPU_ON;
+        reduce(src, dst1, 1, CV_REDUCE_MIN);
+        CPU_OFF;
+
+        GPU_ON;
+        gpu::reduce(d_src, d_dst1, 1, CV_REDUCE_MIN);
+        GPU_OFF;
+    }
+}
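These sample-level benchmarks time cv::reduce against gpu::reduce along both axes at three square sizes. A correctness check that pairs naturally with them (not in the commit; it assumes the same 2.x API and compares the two paths element-wise for one case):

#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <iostream>

int main()
{
    cv::Mat src(1000, 1000, CV_32F);
    cv::randu(src, 0, 255);

    cv::Mat cpu_dst;
    cv::reduce(src, cpu_dst, 0, CV_REDUCE_MIN);   // dim = 0: row of column minima

    cv::gpu::GpuMat d_src(src), d_dst;
    cv::gpu::reduce(d_src, d_dst, 0, CV_REDUCE_MIN);

    cv::Mat gpu_dst;
    d_dst.download(gpu_dst);

    std::cout << "max abs difference: "
              << cv::norm(cpu_dst, gpu_dst, cv::NORM_INF) << std::endl;
    return 0;
}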