diff --git a/modules/cudaarithm/perf/perf_arithm.cpp b/modules/cudaarithm/perf/perf_arithm.cpp index d0f3e6617..900415501 100644 --- a/modules/cudaarithm/perf/perf_arithm.cpp +++ b/modules/cudaarithm/perf/perf_arithm.cpp @@ -248,60 +248,3 @@ PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve, CPU_SANITY_CHECK(dst); } } - -////////////////////////////////////////////////////////////////////// -// Integral - -PERF_TEST_P(Sz, Integral, - CUDA_TYPICAL_MAT_SIZES) -{ - const cv::Size size = GetParam(); - - cv::Mat src(size, CV_8UC1); - declare.in(src, WARMUP_RNG); - - if (PERF_RUN_CUDA()) - { - const cv::cuda::GpuMat d_src(src); - cv::cuda::GpuMat dst; - cv::cuda::GpuMat d_buf; - - TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf); - - CUDA_SANITY_CHECK(dst); - } - else - { - cv::Mat dst; - - TEST_CYCLE() cv::integral(src, dst); - - CPU_SANITY_CHECK(dst); - } -} - -////////////////////////////////////////////////////////////////////// -// IntegralSqr - -PERF_TEST_P(Sz, IntegralSqr, - CUDA_TYPICAL_MAT_SIZES) -{ - const cv::Size size = GetParam(); - - cv::Mat src(size, CV_8UC1); - declare.in(src, WARMUP_RNG); - - if (PERF_RUN_CUDA()) - { - const cv::cuda::GpuMat d_src(src); - cv::cuda::GpuMat dst, buf; - - TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf); - - CUDA_SANITY_CHECK(dst); - } - else - { - FAIL_NO_CPU(); - } -} diff --git a/modules/cudaarithm/perf/perf_reductions.cpp b/modules/cudaarithm/perf/perf_reductions.cpp index aa79bf499..470df48a3 100644 --- a/modules/cudaarithm/perf/perf_reductions.cpp +++ b/modules/cudaarithm/perf/perf_reductions.cpp @@ -465,3 +465,60 @@ PERF_TEST_P(Sz, MeanStdDev, SANITY_CHECK(cpu_stddev); } } + +////////////////////////////////////////////////////////////////////// +// Integral + +PERF_TEST_P(Sz, Integral, + CUDA_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + cv::Mat src(size, CV_8UC1); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + cv::cuda::GpuMat d_buf; + + TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf); + + CUDA_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::integral(src, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// IntegralSqr + +PERF_TEST_P(Sz, IntegralSqr, + CUDA_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + cv::Mat src(size, CV_8UC1); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst, buf; + + TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf); + + CUDA_SANITY_CHECK(dst); + } + else + { + FAIL_NO_CPU(); + } +} diff --git a/modules/cudaarithm/src/cuda/integral.cu b/modules/cudaarithm/src/cuda/integral.cu index ef49f1814..db554eb30 100644 --- a/modules/cudaarithm/src/cuda/integral.cu +++ b/modules/cudaarithm/src/cuda/integral.cu @@ -40,433 +40,61 @@ // //M*/ -#if !defined CUDA_DISABLER +#include "opencv2/opencv_modules.hpp" -#include "opencv2/core/cuda/common.hpp" +#ifndef HAVE_OPENCV_CUDEV -namespace cv { namespace cuda { namespace device +#error "opencv_cudev is required" + +#else + +#include "opencv2/cudaarithm.hpp" +#include "opencv2/cudev.hpp" + +using namespace cv::cudev; + +//////////////////////////////////////////////////////////////////////// +// integral + +void cv::cuda::integral(InputArray _src, OutputArray _dst, GpuMat& buffer, Stream& stream) { - namespace imgproc - { - // Utility function to extract unsigned chars from an unsigned integer - __device__ uchar4 int_to_uchar4(unsigned int in) - { - uchar4 bytes; - bytes.x = (in & 0x000000ff) >> 0; - bytes.y = (in & 0x0000ff00) >> 8; - bytes.z = (in & 0x00ff0000) >> 16; - bytes.w = (in & 0xff000000) >> 24; - return bytes; - } + GpuMat src = _src.getGpuMat(); - __global__ void shfl_integral_horizontal(const PtrStep img, PtrStep integral) - { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) - __shared__ int sums[128]; + CV_Assert( src.type() == CV_8UC1 ); - const int id = threadIdx.x; - const int lane_id = id % warpSize; - const int warp_id = id / warpSize; + GpuMat_& res = (GpuMat_&) buffer; - const uint4 data = img(blockIdx.x, id); + gridIntegral(globPtr(src), res, stream); - const uchar4 a = int_to_uchar4(data.x); - const uchar4 b = int_to_uchar4(data.y); - const uchar4 c = int_to_uchar4(data.z); - const uchar4 d = int_to_uchar4(data.w); + _dst.create(src.rows + 1, src.cols + 1, CV_32SC1); + GpuMat dst = _dst.getGpuMat(); - int result[16]; + dst.setTo(Scalar::all(0), stream); - result[0] = a.x; - result[1] = result[0] + a.y; - result[2] = result[1] + a.z; - result[3] = result[2] + a.w; + GpuMat inner = dst(Rect(1, 1, src.cols, src.rows)); + res.copyTo(inner, stream); +} - result[4] = result[3] + b.x; - result[5] = result[4] + b.y; - result[6] = result[5] + b.z; - result[7] = result[6] + b.w; +////////////////////////////////////////////////////////////////////////////// +// sqrIntegral - result[8] = result[7] + c.x; - result[9] = result[8] + c.y; - result[10] = result[9] + c.z; - result[11] = result[10] + c.w; +void cv::cuda::sqrIntegral(InputArray _src, OutputArray _dst, GpuMat& buf, Stream& stream) +{ + GpuMat src = _src.getGpuMat(); - result[12] = result[11] + d.x; - result[13] = result[12] + d.y; - result[14] = result[13] + d.z; - result[15] = result[14] + d.w; + CV_Assert( src.type() == CV_8UC1 ); - int sum = result[15]; + GpuMat_& res = (GpuMat_&) buf; - // the prefix sum for each thread's 16 value is computed, - // now the final sums (result[15]) need to be shared - // with the other threads and add. To do this, - // the __shfl_up() instruction is used and a shuffle scan - // operation is performed to distribute the sums to the correct - // threads - #pragma unroll - for (int i = 1; i < 32; i *= 2) - { - const int n = __shfl_up(sum, i, 32); + gridIntegral(sqr_(cvt_(globPtr(src))), res, stream); - if (lane_id >= i) - { - #pragma unroll - for (int i = 0; i < 16; ++i) - result[i] += n; + _dst.create(src.rows + 1, src.cols + 1, CV_64FC1); + GpuMat dst = _dst.getGpuMat(); - sum += n; - } - } + dst.setTo(Scalar::all(0), stream); - // Now the final sum for the warp must be shared - // between warps. This is done by each warp - // having a thread store to shared memory, then - // having some other warp load the values and - // compute a prefix sum, again by using __shfl_up. - // The results are uniformly added back to the warps. - // last thread in the warp holding sum of the warp - // places that in shared - if (threadIdx.x % warpSize == warpSize - 1) - sums[warp_id] = result[15]; + GpuMat inner = dst(Rect(1, 1, src.cols, src.rows)); + res.copyTo(inner, stream); +} - __syncthreads(); - - if (warp_id == 0) - { - int warp_sum = sums[lane_id]; - - #pragma unroll - for (int i = 1; i <= 32; i *= 2) - { - const int n = __shfl_up(warp_sum, i, 32); - - if (lane_id >= i) - warp_sum += n; - } - - sums[lane_id] = warp_sum; - } - - __syncthreads(); - - int blockSum = 0; - - // fold in unused warp - if (warp_id > 0) - { - blockSum = sums[warp_id - 1]; - - #pragma unroll - for (int i = 0; i < 16; ++i) - result[i] += blockSum; - } - - // assemble result - // Each thread has 16 values to write, which are - // now integer data (to avoid overflow). Instead of - // each thread writing consecutive uint4s, the - // approach shown here experiments using - // the shuffle command to reformat the data - // inside the registers so that each thread holds - // consecutive data to be written so larger contiguous - // segments can be assembled for writing. - - /* - For example data that needs to be written as - - GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3 - but is stored in registers (r0..r3), in four threads (0..3) as: - - threadId 0 1 2 3 - r0 x0 y0 z0 w0 - r1 x1 y1 z1 w1 - r2 x2 y2 z2 w2 - r3 x3 y3 z3 w3 - - after apply __shfl_xor operations to move data between registers r1..r3: - - threadId 00 01 10 11 - x0 y0 z0 w0 - xor(01)->y1 x1 w1 z1 - xor(10)->z2 w2 x2 y2 - xor(11)->w3 z3 y3 x3 - - and now x0..x3, and z0..z3 can be written out in order by all threads. - - In the current code, each register above is actually representing - four integers to be written as uint4's to GMEM. - */ - - result[4] = __shfl_xor(result[4] , 1, 32); - result[5] = __shfl_xor(result[5] , 1, 32); - result[6] = __shfl_xor(result[6] , 1, 32); - result[7] = __shfl_xor(result[7] , 1, 32); - - result[8] = __shfl_xor(result[8] , 2, 32); - result[9] = __shfl_xor(result[9] , 2, 32); - result[10] = __shfl_xor(result[10], 2, 32); - result[11] = __shfl_xor(result[11], 2, 32); - - result[12] = __shfl_xor(result[12], 3, 32); - result[13] = __shfl_xor(result[13], 3, 32); - result[14] = __shfl_xor(result[14], 3, 32); - result[15] = __shfl_xor(result[15], 3, 32); - - uint4* integral_row = integral.ptr(blockIdx.x); - uint4 output; - - /////// - - if (threadIdx.x % 4 == 0) - output = make_uint4(result[0], result[1], result[2], result[3]); - - if (threadIdx.x % 4 == 1) - output = make_uint4(result[4], result[5], result[6], result[7]); - - if (threadIdx.x % 4 == 2) - output = make_uint4(result[8], result[9], result[10], result[11]); - - if (threadIdx.x % 4 == 3) - output = make_uint4(result[12], result[13], result[14], result[15]); - - integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output; - - /////// - - if (threadIdx.x % 4 == 2) - output = make_uint4(result[0], result[1], result[2], result[3]); - - if (threadIdx.x % 4 == 3) - output = make_uint4(result[4], result[5], result[6], result[7]); - - if (threadIdx.x % 4 == 0) - output = make_uint4(result[8], result[9], result[10], result[11]); - - if (threadIdx.x % 4 == 1) - output = make_uint4(result[12], result[13], result[14], result[15]); - - integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output; - - // continuning from the above example, - // this use of __shfl_xor() places the y0..y3 and w0..w3 data - // in order. - - #pragma unroll - for (int i = 0; i < 16; ++i) - result[i] = __shfl_xor(result[i], 1, 32); - - if (threadIdx.x % 4 == 0) - output = make_uint4(result[0], result[1], result[2], result[3]); - - if (threadIdx.x % 4 == 1) - output = make_uint4(result[4], result[5], result[6], result[7]); - - if (threadIdx.x % 4 == 2) - output = make_uint4(result[8], result[9], result[10], result[11]); - - if (threadIdx.x % 4 == 3) - output = make_uint4(result[12], result[13], result[14], result[15]); - - integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output; - - /////// - - if (threadIdx.x % 4 == 2) - output = make_uint4(result[0], result[1], result[2], result[3]); - - if (threadIdx.x % 4 == 3) - output = make_uint4(result[4], result[5], result[6], result[7]); - - if (threadIdx.x % 4 == 0) - output = make_uint4(result[8], result[9], result[10], result[11]); - - if (threadIdx.x % 4 == 1) - output = make_uint4(result[12], result[13], result[14], result[15]); - - integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output; - #endif - } - - // This kernel computes columnwise prefix sums. When the data input is - // the row sums from above, this completes the integral image. - // The approach here is to have each block compute a local set of sums. - // First , the data covered by the block is loaded into shared memory, - // then instead of performing a sum in shared memory using __syncthreads - // between stages, the data is reformatted so that the necessary sums - // occur inside warps and the shuffle scan operation is used. - // The final set of sums from the block is then propgated, with the block - // computing "down" the image and adding the running sum to the local - // block sums. - __global__ void shfl_integral_vertical(PtrStepSz integral) - { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) - __shared__ unsigned int sums[32][9]; - - const int tidx = blockIdx.x * blockDim.x + threadIdx.x; - const int lane_id = tidx % 8; - - if (tidx >= integral.cols) - return; - - sums[threadIdx.x][threadIdx.y] = 0; - __syncthreads(); - - unsigned int stepSum = 0; - - for (int y = threadIdx.y; y < integral.rows; y += blockDim.y) - { - unsigned int* p = integral.ptr(y) + tidx; - - unsigned int sum = *p; - - sums[threadIdx.x][threadIdx.y] = sum; - __syncthreads(); - - // place into SMEM - // shfl scan reduce the SMEM, reformating so the column - // sums are computed in a warp - // then read out properly - const int j = threadIdx.x % 8; - const int k = threadIdx.x / 8 + threadIdx.y * 4; - - int partial_sum = sums[k][j]; - - for (int i = 1; i <= 8; i *= 2) - { - int n = __shfl_up(partial_sum, i, 32); - - if (lane_id >= i) - partial_sum += n; - } - - sums[k][j] = partial_sum; - __syncthreads(); - - if (threadIdx.y > 0) - sum += sums[threadIdx.x][threadIdx.y - 1]; - - sum += stepSum; - stepSum += sums[threadIdx.x][blockDim.y - 1]; - - __syncthreads(); - - *p = sum; - } - #endif - } - - void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz integral, cudaStream_t stream) - { - { - // each thread handles 16 values, use 1 block/row - // save, becouse step is actually can't be less 512 bytes - int block = integral.cols / 16; - - // launch 1 block / row - const int grid = img.rows; - - cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) ); - - shfl_integral_horizontal<<>>((const PtrStepSz) img, (PtrStepSz) integral); - cudaSafeCall( cudaGetLastError() ); - } - - { - const dim3 block(32, 8); - const dim3 grid(divUp(integral.cols, block.x), 1); - - shfl_integral_vertical<<>>(integral); - cudaSafeCall( cudaGetLastError() ); - } - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - __global__ void shfl_integral_vertical(PtrStepSz buffer, PtrStepSz integral) - { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) - __shared__ unsigned int sums[32][9]; - - const int tidx = blockIdx.x * blockDim.x + threadIdx.x; - const int lane_id = tidx % 8; - - if (tidx >= integral.cols) - return; - - sums[threadIdx.x][threadIdx.y] = 0; - __syncthreads(); - - unsigned int stepSum = 0; - - for (int y = threadIdx.y; y < integral.rows; y += blockDim.y) - { - unsigned int* p = buffer.ptr(y) + tidx; - unsigned int* dst = integral.ptr(y + 1) + tidx + 1; - - unsigned int sum = *p; - - sums[threadIdx.x][threadIdx.y] = sum; - __syncthreads(); - - // place into SMEM - // shfl scan reduce the SMEM, reformating so the column - // sums are computed in a warp - // then read out properly - const int j = threadIdx.x % 8; - const int k = threadIdx.x / 8 + threadIdx.y * 4; - - int partial_sum = sums[k][j]; - - for (int i = 1; i <= 8; i *= 2) - { - int n = __shfl_up(partial_sum, i, 32); - - if (lane_id >= i) - partial_sum += n; - } - - sums[k][j] = partial_sum; - __syncthreads(); - - if (threadIdx.y > 0) - sum += sums[threadIdx.x][threadIdx.y - 1]; - - sum += stepSum; - stepSum += sums[threadIdx.x][blockDim.y - 1]; - - __syncthreads(); - - *dst = sum; - } - #endif - } - - // used for frame preprocessing before Soft Cascade evaluation: no synchronization needed - void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz buffer, PtrStepSz integral, - int blockStep, cudaStream_t stream) - { - { - const int block = blockStep; - const int grid = img.rows; - - cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) ); - - shfl_integral_horizontal<<>>((PtrStepSz) img, buffer); - cudaSafeCall( cudaGetLastError() ); - } - - { - const dim3 block(32, 8); - const dim3 grid(divUp(integral.cols, block.x), 1); - - shfl_integral_vertical<<>>((PtrStepSz)buffer, integral); - cudaSafeCall( cudaGetLastError() ); - } - } - } -}}} - -#endif /* CUDA_DISABLER */ +#endif diff --git a/modules/cudaarithm/src/reductions.cpp b/modules/cudaarithm/src/reductions.cpp index 81307f4d5..5a4a2df0d 100644 --- a/modules/cudaarithm/src/reductions.cpp +++ b/modules/cudaarithm/src/reductions.cpp @@ -294,116 +294,4 @@ void cv::cuda::normalize(InputArray _src, OutputArray dst, double a, double b, i } } -//////////////////////////////////////////////////////////////////////// -// integral - -namespace cv { namespace cuda { namespace device -{ - namespace imgproc - { - void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz integral, cudaStream_t stream); - } -}}} - -void cv::cuda::integral(InputArray _src, OutputArray _dst, GpuMat& buffer, Stream& _stream) -{ - GpuMat src = _src.getGpuMat(); - - CV_Assert( src.type() == CV_8UC1 ); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - - cv::Size whole; - cv::Point offset; - src.locateROI(whole, offset); - - if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048 - && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast(src.step) - offset.x)) - { - ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer); - - cv::cuda::device::imgproc::shfl_integral_gpu(src, buffer, stream); - - _dst.create(src.rows + 1, src.cols + 1, CV_32SC1); - GpuMat dst = _dst.getGpuMat(); - - dst.setTo(Scalar::all(0), _stream); - - GpuMat inner = dst(Rect(1, 1, src.cols, src.rows)); - GpuMat res = buffer(Rect(0, 0, src.cols, src.rows)); - - res.copyTo(inner, _stream); - } - else - { - #ifndef HAVE_OPENCV_CUDALEGACY - throw_no_cuda(); - #else - _dst.create(src.rows + 1, src.cols + 1, CV_32SC1); - GpuMat dst = _dst.getGpuMat(); - - NcvSize32u roiSize; - roiSize.width = src.cols; - roiSize.height = src.rows; - - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) ); - - Ncv32u bufSize; - ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) ); - ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer); - - NppStStreamHandler h(stream); - - ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast(src.ptr()), static_cast(src.step), - dst.ptr(), static_cast(dst.step), roiSize, buffer.ptr(), bufSize, prop) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - #endif - } -} - -////////////////////////////////////////////////////////////////////////////// -// sqrIntegral - -void cv::cuda::sqrIntegral(InputArray _src, OutputArray _dst, GpuMat& buf, Stream& _stream) -{ -#ifndef HAVE_OPENCV_CUDALEGACY - (void) _src; - (void) _dst; - (void) _stream; - throw_no_cuda(); -#else - GpuMat src = _src.getGpuMat(); - - CV_Assert( src.type() == CV_8U ); - - NcvSize32u roiSize; - roiSize.width = src.cols; - roiSize.height = src.rows; - - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) ); - - Ncv32u bufSize; - ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop)); - - ensureSizeIsEnough(1, bufSize, CV_8U, buf); - - cudaStream_t stream = StreamAccessor::getStream(_stream); - - NppStStreamHandler h(stream); - - _dst.create(src.rows + 1, src.cols + 1, CV_64F); - GpuMat dst = _dst.getGpuMat(); - - ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast(src.ptr(0)), static_cast(src.step), - dst.ptr(0), static_cast(dst.step), roiSize, buf.ptr(0), bufSize, prop)); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -#endif -} - #endif diff --git a/modules/cudaarithm/test/test_arithm.cpp b/modules/cudaarithm/test/test_arithm.cpp index bd3f250f3..0ee4e3469 100644 --- a/modules/cudaarithm/test/test_arithm.cpp +++ b/modules/cudaarithm/test/test_arithm.cpp @@ -125,43 +125,6 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, GEMM, testing::Combine( ALL_GEMM_FLAGS, WHOLE_SUBMAT)); -/////////////////////////////////////////////////////////////////////////////////////////////////////// -// Integral - -PARAM_TEST_CASE(Integral, cv::cuda::DeviceInfo, cv::Size, UseRoi) -{ - cv::cuda::DeviceInfo devInfo; - cv::Size size; - bool useRoi; - - virtual void SetUp() - { - devInfo = GET_PARAM(0); - size = GET_PARAM(1); - useRoi = GET_PARAM(2); - - cv::cuda::setDevice(devInfo.deviceID()); - } -}; - -CUDA_TEST_P(Integral, Accuracy) -{ - cv::Mat src = randomMat(size, CV_8UC1); - - cv::cuda::GpuMat dst = createMat(cv::Size(src.cols + 1, src.rows + 1), CV_32SC1, useRoi); - cv::cuda::integral(loadMat(src, useRoi), dst); - - cv::Mat dst_gold; - cv::integral(src, dst_gold, CV_32S); - - EXPECT_MAT_NEAR(dst_gold, dst, 0.0); -} - -INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Integral, testing::Combine( - ALL_DEVICES, - DIFFERENT_SIZES, - WHOLE_SUBMAT)); - //////////////////////////////////////////////////////////////////////////// // MulSpectrums diff --git a/modules/cudaarithm/test/test_reductions.cpp b/modules/cudaarithm/test/test_reductions.cpp index 69cb6aff3..68974bcef 100644 --- a/modules/cudaarithm/test/test_reductions.cpp +++ b/modules/cudaarithm/test/test_reductions.cpp @@ -816,4 +816,78 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, MeanStdDev, testing::Combine( DIFFERENT_SIZES, WHOLE_SUBMAT)); +/////////////////////////////////////////////////////////////////////////////////////////////////////// +// Integral + +PARAM_TEST_CASE(Integral, cv::cuda::DeviceInfo, cv::Size, UseRoi) +{ + cv::cuda::DeviceInfo devInfo; + cv::Size size; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + useRoi = GET_PARAM(2); + + cv::cuda::setDevice(devInfo.deviceID()); + } +}; + +CUDA_TEST_P(Integral, Accuracy) +{ + cv::Mat src = randomMat(size, CV_8UC1); + + cv::cuda::GpuMat dst = createMat(cv::Size(src.cols + 1, src.rows + 1), CV_32SC1, useRoi); + cv::cuda::integral(loadMat(src, useRoi), dst); + + cv::Mat dst_gold; + cv::integral(src, dst_gold, CV_32S); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Integral, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + WHOLE_SUBMAT)); + +/////////////////////////////////////////////////////////////////////////////////////////////////////// +// IntegralSqr + +PARAM_TEST_CASE(IntegralSqr, cv::cuda::DeviceInfo, cv::Size, UseRoi) +{ + cv::cuda::DeviceInfo devInfo; + cv::Size size; + bool useRoi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + useRoi = GET_PARAM(2); + + cv::cuda::setDevice(devInfo.deviceID()); + } +}; + +CUDA_TEST_P(IntegralSqr, Accuracy) +{ + cv::Mat src = randomMat(size, CV_8UC1); + + cv::cuda::GpuMat dst = createMat(cv::Size(src.cols + 1, src.rows + 1), CV_64FC1, useRoi); + cv::cuda::sqrIntegral(loadMat(src, useRoi), dst); + + cv::Mat dst_gold, temp; + cv::integral(src, temp, dst_gold); + + EXPECT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(CUDA_Arithm, IntegralSqr, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + WHOLE_SUBMAT)); + #endif // HAVE_CUDA