From 00e7816c1bf177887e4645ee2afa811bc72a395a Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Wed, 24 Dec 2014 13:33:17 +0300
Subject: [PATCH 1/6] add auxiliary functions to work with Input/Output arrays:
 they allow performing an asynchronous upload/download into a temporary buffer
 to get a valid GpuMat object

---
 .../include/opencv2/core/private.cuda.hpp | 10 ++++
 modules/core/src/cuda_gpu_mat.cpp         | 47 +++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/modules/core/include/opencv2/core/private.cuda.hpp b/modules/core/include/opencv2/core/private.cuda.hpp
index a97388bd0..aaa777c91 100644
--- a/modules/core/include/opencv2/core/private.cuda.hpp
+++ b/modules/core/include/opencv2/core/private.cuda.hpp
@@ -106,6 +106,16 @@ namespace cv { namespace cuda
         GpuMat::Allocator* allocator_;
     };

+    CV_EXPORTS GpuMat getInputMat(InputArray _src, Stream& stream);
+
+    CV_EXPORTS GpuMat getOutputMat(OutputArray _dst, int rows, int cols, int type, Stream& stream);
+    static inline GpuMat getOutputMat(OutputArray _dst, Size size, int type, Stream& stream)
+    {
+        return getOutputMat(_dst, size.height, size.width, type, stream);
+    }
+
+    CV_EXPORTS void syncOutput(const GpuMat& dst, OutputArray _dst, Stream& stream);
+
     static inline void checkNppError(int code, const char* file, const int line, const char* func)
     {
         if (code < 0)

diff --git a/modules/core/src/cuda_gpu_mat.cpp b/modules/core/src/cuda_gpu_mat.cpp
index 4440d5853..8a7b236ac 100644
--- a/modules/core/src/cuda_gpu_mat.cpp
+++ b/modules/core/src/cuda_gpu_mat.cpp
@@ -342,6 +342,53 @@ void cv::cuda::ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr)
     }
 }

+GpuMat cv::cuda::getInputMat(InputArray _src, Stream& stream)
+{
+    GpuMat src;
+
+    if (_src.kind() == _InputArray::CUDA_GPU_MAT)
+    {
+        src = _src.getGpuMat();
+    }
+    else if (!_src.empty())
+    {
+        BufferPool pool(stream);
+        src = pool.getBuffer(_src.size(), _src.type());
+        src.upload(_src, stream);
+    }
+
+    return src;
+}
+
+GpuMat cv::cuda::getOutputMat(OutputArray _dst, int rows, int cols, int type, Stream& stream)
+{
+    GpuMat dst;
+
+    if (_dst.kind() == _InputArray::CUDA_GPU_MAT)
+    {
+        _dst.create(rows, cols, type);
+        dst = _dst.getGpuMat();
+    }
+    else
+    {
+        BufferPool pool(stream);
+        dst = pool.getBuffer(rows, cols, type);
+    }
+
+    return dst;
+}
+
+void cv::cuda::syncOutput(const GpuMat& dst, OutputArray _dst, Stream& stream)
+{
+    if (_dst.kind() != _InputArray::CUDA_GPU_MAT)
+    {
+        if (stream)
+            dst.download(_dst, stream);
+        else
+            dst.download(_dst);
+    }
+}
+
 #ifndef HAVE_CUDA

 GpuMat::Allocator* cv::cuda::GpuMat::defaultAllocator()

From 3d0410c147b011ab9dfc67fe6b34763d74e4135e Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Wed, 24 Dec 2014 13:35:23 +0300
Subject: [PATCH 2/6] remove reciprocal version of cuda::divide

it might cause errors due to implicit type conversion silently selecting
another cuda::divide overload

---
 modules/cudaarithm/include/opencv2/cudaarithm.hpp   | 6 ------
 modules/cudaarithm/test/test_element_operations.cpp | 4 ++--
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
index 98ebfbef8..be095b92e 100644
--- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp
+++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
@@ -130,12 +130,6 @@ This function, in contrast to divide, uses a round-down rounding mode.
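With the reciprocal overload removed by this patch, the form dst = scale/src2 is expressed
through this function directly; a minimal sketch with illustrative values (a plain double
first argument binds to InputArray, as the updated tests below show):
@code
cv::cuda::GpuMat src(480, 640, CV_32FC1), dst;
cv::cuda::divide(2.0, src, dst, 1.0, -1);  // dst(i) = 2.0 / src(i)
@endcode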
*/ CV_EXPORTS void divide(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null()); -//! computes element-wise weighted reciprocal of an array (dst = scale/src2) -static inline void divide(double src1, InputArray src2, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null()) -{ - divide(src1, src2, dst, 1.0, dtype, stream); -} - /** @brief Computes per-element absolute difference of two matrices (or of a matrix and scalar). @param src1 First source matrix or scalar. diff --git a/modules/cudaarithm/test/test_element_operations.cpp b/modules/cudaarithm/test/test_element_operations.cpp index 4a43d9d30..a4a16ea89 100644 --- a/modules/cudaarithm/test/test_element_operations.cpp +++ b/modules/cudaarithm/test/test_element_operations.cpp @@ -1329,7 +1329,7 @@ CUDA_TEST_P(Divide_Scalar_First, Accuracy) try { cv::cuda::GpuMat dst; - cv::cuda::divide(scale, loadMat(mat), dst, depth.second); + cv::cuda::divide(scale, loadMat(mat), dst, 1.0, depth.second); } catch (const cv::Exception& e) { @@ -1339,7 +1339,7 @@ CUDA_TEST_P(Divide_Scalar_First, Accuracy) else { cv::cuda::GpuMat dst = createMat(size, depth.second, useRoi); - cv::cuda::divide(scale, loadMat(mat, useRoi), dst, depth.second); + cv::cuda::divide(scale, loadMat(mat, useRoi), dst, 1.0, depth.second); cv::Mat dst_gold; cv::divide(scale, mat, dst_gold, depth.second); From 7454189c2a9fadadb7f3ddfc8b24a2070b70cda6 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 24 Dec 2014 13:38:02 +0300 Subject: [PATCH 3/6] use new getInputMat/getOutputMat/syncOutput methods in cudaarithm routines --- modules/cudaarithm/src/arithm.cpp | 22 +++--- modules/cudaarithm/src/core.cpp | 6 +- modules/cudaarithm/src/cuda/add_weighted.cu | 28 ++++--- modules/cudaarithm/src/cuda/bitwise_mat.cu | 12 ++- .../cudaarithm/src/cuda/copy_make_border.cu | 10 ++- modules/cudaarithm/src/cuda/lut.cu | 9 ++- modules/cudaarithm/src/cuda/math.cu | 77 +++++++++--------- modules/cudaarithm/src/cuda/mul_spectrums.cu | 21 +++-- modules/cudaarithm/src/cuda/polar_cart.cu | 78 ++++++++++--------- modules/cudaarithm/src/cuda/reduce.cu | 10 ++- modules/cudaarithm/src/cuda/split_merge.cu | 22 +++--- modules/cudaarithm/src/cuda/threshold.cu | 13 ++-- modules/cudaarithm/src/cuda/transpose.cu | 9 ++- modules/cudaarithm/src/element_operations.cpp | 75 ++++++++++-------- 14 files changed, 220 insertions(+), 172 deletions(-) diff --git a/modules/cudaarithm/src/arithm.cpp b/modules/cudaarithm/src/arithm.cpp index 63246abd5..b2107dd1f 100644 --- a/modules/cudaarithm/src/arithm.cpp +++ b/modules/cudaarithm/src/arithm.cpp @@ -169,9 +169,9 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray #else // CUBLAS works with column-major matrices - GpuMat src1 = _src1.getGpuMat(); - GpuMat src2 = _src2.getGpuMat(); - GpuMat src3 = _src3.getGpuMat(); + GpuMat src1 = getInputMat(_src1, stream); + GpuMat src2 = getInputMat(_src2, stream); + GpuMat src3 = getInputMat(_src3, stream); CV_Assert( src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2 ); CV_Assert( src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()) ); @@ -200,8 +200,7 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray CV_Assert( src1Size.width == src2Size.height ); CV_Assert( src3.empty() || src3Size == dstSize ); - _dst.create(dstSize, src1.type()); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, dstSize, 
src1.type(), stream); if (beta != 0) { @@ -281,6 +280,8 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray } cublasSafeCall( cublasDestroy_v2(handle) ); + + syncOutput(dst, _dst, stream); #endif } @@ -297,7 +298,7 @@ void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, (void) stream; throw_no_cuda(); #else - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 ); @@ -462,16 +463,15 @@ namespace void ConvolutionImpl::convolve(InputArray _image, InputArray _templ, OutputArray _result, bool ccorr, Stream& _stream) { - GpuMat image = _image.getGpuMat(); - GpuMat templ = _templ.getGpuMat(); + GpuMat image = getInputMat(_image, _stream); + GpuMat templ = getInputMat(_templ, _stream); CV_Assert( image.type() == CV_32FC1 ); CV_Assert( templ.type() == CV_32FC1 ); create(image.size(), templ.size()); - _result.create(result_size, CV_32FC1); - GpuMat result = _result.getGpuMat(); + GpuMat result = getOutputMat(_result, result_size, CV_32FC1, _stream); cudaStream_t stream = StreamAccessor::getStream(_stream); @@ -520,6 +520,8 @@ namespace cufftSafeCall( cufftDestroy(planR2C) ); cufftSafeCall( cufftDestroy(planC2R) ); + + syncOutput(result, _result, _stream); } } diff --git a/modules/cudaarithm/src/core.cpp b/modules/cudaarithm/src/core.cpp index eb71d6a4e..7dd51f978 100644 --- a/modules/cudaarithm/src/core.cpp +++ b/modules/cudaarithm/src/core.cpp @@ -119,15 +119,17 @@ void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& str {NppMirror::call, 0, NppMirror::call, NppMirror::call} }; - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F); CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4); _dst.create(src.size(), src.type()); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream); funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream)); + + syncOutput(dst, _dst, stream); } #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/cudaarithm/src/cuda/add_weighted.cu b/modules/cudaarithm/src/cuda/add_weighted.cu index d5c00f607..929301076 100644 --- a/modules/cudaarithm/src/cuda/add_weighted.cu +++ b/modules/cudaarithm/src/cuda/add_weighted.cu @@ -50,7 +50,10 @@ #include "opencv2/cudaarithm.hpp" #include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; namespace @@ -63,7 +66,7 @@ namespace __device__ __forceinline__ D operator ()(T1 a, T2 b) const { - return saturate_cast(a * alpha + b * beta + gamma); + return cudev::saturate_cast(a * alpha + b * beta + gamma); } }; @@ -555,8 +558,8 @@ void cv::cuda::addWeighted(InputArray _src1, double alpha, InputArray _src2, dou } }; - GpuMat src1 = _src1.getGpuMat(); - GpuMat src2 = _src2.getGpuMat(); + GpuMat src1 = getInputMat(_src1, stream); + GpuMat src2 = getInputMat(_src2, stream); int sdepth1 = src1.depth(); int sdepth2 = src2.depth(); @@ -564,19 +567,18 @@ void cv::cuda::addWeighted(InputArray _src1, double alpha, InputArray _src2, dou ddepth = ddepth >= 0 ? 
CV_MAT_DEPTH(ddepth) : std::max(sdepth1, sdepth2); const int cn = src1.channels(); - CV_DbgAssert( src2.size() == src1.size() && src2.channels() == cn ); - CV_DbgAssert( sdepth1 <= CV_64F && sdepth2 <= CV_64F && ddepth <= CV_64F ); + CV_Assert( src2.size() == src1.size() && src2.channels() == cn ); + CV_Assert( sdepth1 <= CV_64F && sdepth2 <= CV_64F && ddepth <= CV_64F ); - _dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn)); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src1.size(), CV_MAKE_TYPE(ddepth, cn), stream); - GpuMat src1_ = src1.reshape(1); - GpuMat src2_ = src2.reshape(1); - GpuMat dst_ = dst.reshape(1); + GpuMat src1_single = src1.reshape(1); + GpuMat src2_single = src2.reshape(1); + GpuMat dst_single = dst.reshape(1); if (sdepth1 > sdepth2) { - src1_.swap(src2_); + src1_single.swap(src2_single); std::swap(alpha, beta); std::swap(sdepth1, sdepth2); } @@ -586,7 +588,9 @@ void cv::cuda::addWeighted(InputArray _src1, double alpha, InputArray _src2, dou if (!func) CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); - func(src1_, alpha, src2_, beta, gamma, dst_, stream); + func(src1_single, alpha, src2_single, beta, gamma, dst_single, stream); + + syncOutput(dst, _dst, stream); } #endif diff --git a/modules/cudaarithm/src/cuda/bitwise_mat.cu b/modules/cudaarithm/src/cuda/bitwise_mat.cu index b2bf288be..f151c1a48 100644 --- a/modules/cudaarithm/src/cuda/bitwise_mat.cu +++ b/modules/cudaarithm/src/cuda/bitwise_mat.cu @@ -50,7 +50,10 @@ #include "opencv2/cudaarithm.hpp" #include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op); @@ -60,16 +63,15 @@ void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& m void cv::cuda::bitwise_not(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream) { - GpuMat src = _src.getGpuMat(); - GpuMat mask = _mask.getGpuMat(); + GpuMat src = getInputMat(_src, stream); + GpuMat mask = getInputMat(_mask, stream); const int depth = src.depth(); CV_DbgAssert( depth <= CV_32F ); CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) ); - _dst.create(src.size(), src.type()); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream); if (mask.empty()) { @@ -125,6 +127,8 @@ void cv::cuda::bitwise_not(InputArray _src, OutputArray _dst, InputArray _mask, gridTransformUnary(vsrc, vdst, bit_not(), singleMaskChannels(globPtr(mask), src.channels()), stream); } } + + syncOutput(dst, _dst, stream); } ////////////////////////////////////////////////////////////////////////////// diff --git a/modules/cudaarithm/src/cuda/copy_make_border.cu b/modules/cudaarithm/src/cuda/copy_make_border.cu index f7dd91f98..ce9cda36c 100644 --- a/modules/cudaarithm/src/cuda/copy_make_border.cu +++ b/modules/cudaarithm/src/cuda/copy_make_border.cu @@ -50,7 +50,10 @@ #include "opencv2/cudaarithm.hpp" #include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; namespace @@ -133,7 +136,7 @@ void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bo { copyMakeBorderImpl , 0 /*copyMakeBorderImpl*/, copyMakeBorderImpl , copyMakeBorderImpl } }; - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); 
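// The pattern this patch applies across cudaarithm, sketched once here for
// reference (illustrative only, not part of the diff; the helper names come
// from PATCH 1/6, and _src/_dst/stream are the usual wrapper arguments):
//
//     GpuMat src = getInputMat(_src, stream);      // async upload via BufferPool if _src is a host array
//     GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
//     // ... launch the kernels on src and dst ...
//     syncOutput(dst, _dst, stream);               // async download if _dst is a host array
//
// GpuMat callers keep the old zero-copy path; host Mat callers are now staged
// through temporary device buffers instead.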
const int depth = src.depth(); const int cn = src.channels(); @@ -141,8 +144,7 @@ void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bo CV_Assert( depth <= CV_32F && cn <= 4 ); CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP ); - _dst.create(src.rows + top + bottom, src.cols + left + right, src.type()); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src.rows + top + bottom, src.cols + left + right, src.type(), stream); const func_t func = funcs[depth][cn - 1]; @@ -150,6 +152,8 @@ void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bo CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); func(src, dst, top, left, borderType, value, stream); + + syncOutput(dst, _dst, stream); } #endif diff --git a/modules/cudaarithm/src/cuda/lut.cu b/modules/cudaarithm/src/cuda/lut.cu index 0b1fe8b0d..56efb8fa8 100644 --- a/modules/cudaarithm/src/cuda/lut.cu +++ b/modules/cudaarithm/src/cuda/lut.cu @@ -50,8 +50,10 @@ #include "opencv2/cudaarithm.hpp" #include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; namespace @@ -165,7 +167,7 @@ namespace void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& stream) { - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); const int cn = src.channels(); const int lut_cn = d_lut.channels(); @@ -173,8 +175,7 @@ namespace CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 ); CV_Assert( lut_cn == 1 || lut_cn == cn ); - _dst.create(src.size(), src.type()); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream); if (lut_cn == 1) { @@ -196,6 +197,8 @@ namespace dst3.assign(lut_(src3, tbl), stream); } + + syncOutput(dst, _dst, stream); } } diff --git a/modules/cudaarithm/src/cuda/math.cu b/modules/cudaarithm/src/cuda/math.cu index 39f822081..41d762f6a 100644 --- a/modules/cudaarithm/src/cuda/math.cu +++ b/modules/cudaarithm/src/cuda/math.cu @@ -50,7 +50,10 @@ #include "opencv2/cudaarithm.hpp" #include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; namespace @@ -92,16 +95,15 @@ void cv::cuda::abs(InputArray _src, OutputArray _dst, Stream& stream) absMat }; - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); - const int depth = src.depth(); + CV_Assert( src.depth() <= CV_64F ); - CV_DbgAssert( depth <= CV_64F ); + GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream); - _dst.create(src.size(), src.type()); - GpuMat dst = _dst.getGpuMat(); + funcs[src.depth()](src.reshape(1), dst.reshape(1), stream); - funcs[depth](src.reshape(1), dst.reshape(1), stream); + syncOutput(dst, _dst, stream); } ////////////////////////////////////////////////////////////////////////////// @@ -113,7 +115,7 @@ namespace { __device__ __forceinline__ T operator ()(T x) const { - return saturate_cast(x * x); + return cudev::saturate_cast(x * x); } }; @@ -138,16 +140,15 @@ void cv::cuda::sqr(InputArray _src, OutputArray _dst, Stream& stream) sqrMat }; - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); - const int depth = src.depth(); + CV_Assert( src.depth() <= CV_64F ); - CV_DbgAssert( depth <= CV_64F ); + GpuMat dst = getOutputMat(_dst, 
src.size(), src.type(), stream); - _dst.create(src.size(), src.type()); - GpuMat dst = _dst.getGpuMat(); + funcs[src.depth()](src.reshape(1), dst.reshape(1), stream); - funcs[depth](src.reshape(1), dst.reshape(1), stream); + syncOutput(dst, _dst, stream); } ////////////////////////////////////////////////////////////////////////////// @@ -176,16 +177,15 @@ void cv::cuda::sqrt(InputArray _src, OutputArray _dst, Stream& stream) sqrtMat }; - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); - const int depth = src.depth(); + CV_Assert( src.depth() <= CV_64F ); - CV_DbgAssert( depth <= CV_64F ); + GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream); - _dst.create(src.size(), src.type()); - GpuMat dst = _dst.getGpuMat(); + funcs[src.depth()](src.reshape(1), dst.reshape(1), stream); - funcs[depth](src.reshape(1), dst.reshape(1), stream); + syncOutput(dst, _dst, stream); } //////////////////////////////////////////////////////////////////////// @@ -198,7 +198,7 @@ namespace __device__ __forceinline__ T operator ()(T x) const { exp_func f; - return saturate_cast(f(x)); + return cudev::saturate_cast(f(x)); } }; @@ -223,16 +223,15 @@ void cv::cuda::exp(InputArray _src, OutputArray _dst, Stream& stream) expMat }; - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); - const int depth = src.depth(); + CV_Assert( src.depth() <= CV_64F ); - CV_DbgAssert( depth <= CV_64F ); + GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream); - _dst.create(src.size(), src.type()); - GpuMat dst = _dst.getGpuMat(); + funcs[src.depth()](src.reshape(1), dst.reshape(1), stream); - funcs[depth](src.reshape(1), dst.reshape(1), stream); + syncOutput(dst, _dst, stream); } //////////////////////////////////////////////////////////////////////// @@ -261,16 +260,15 @@ void cv::cuda::log(InputArray _src, OutputArray _dst, Stream& stream) logMat }; - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); - const int depth = src.depth(); + CV_Assert( src.depth() <= CV_64F ); - CV_DbgAssert( depth <= CV_64F ); + GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream); - _dst.create(src.size(), src.type()); - GpuMat dst = _dst.getGpuMat(); + funcs[src.depth()](src.reshape(1), dst.reshape(1), stream); - funcs[depth](src.reshape(1), dst.reshape(1), stream); + syncOutput(dst, _dst, stream); } //////////////////////////////////////////////////////////////////////// @@ -284,7 +282,7 @@ namespace __device__ __forceinline__ T operator()(T e) const { - return saturate_cast(__powf((float)e, power)); + return cudev::saturate_cast(__powf((float)e, power)); } }; template struct PowOp : unary_function @@ -293,7 +291,7 @@ namespace __device__ __forceinline__ T operator()(T e) const { - T res = saturate_cast(__powf((float)e, power)); + T res = cudev::saturate_cast(__powf((float)e, power)); if ((e < 0) && (1 & static_cast(power))) res *= -1; @@ -344,16 +342,15 @@ void cv::cuda::pow(InputArray _src, double power, OutputArray _dst, Stream& stre powMat }; - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); - const int depth = src.depth(); + CV_Assert( src.depth() <= CV_64F ); - CV_DbgAssert(depth <= CV_64F); + GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream); - _dst.create(src.size(), src.type()); - GpuMat dst = _dst.getGpuMat(); + funcs[src.depth()](src.reshape(1), power, dst.reshape(1), stream); - funcs[depth](src.reshape(1), power, dst.reshape(1), stream); + syncOutput(dst, _dst, stream); } #endif diff --git 
a/modules/cudaarithm/src/cuda/mul_spectrums.cu b/modules/cudaarithm/src/cuda/mul_spectrums.cu index b06090481..bd62f9903 100644 --- a/modules/cudaarithm/src/cuda/mul_spectrums.cu +++ b/modules/cudaarithm/src/cuda/mul_spectrums.cu @@ -50,7 +50,10 @@ #include "opencv2/cudaarithm.hpp" #include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; ////////////////////////////////////////////////////////////////////////////// @@ -120,33 +123,33 @@ void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst { (void) flags; - GpuMat src1 = _src1.getGpuMat(); - GpuMat src2 = _src2.getGpuMat(); + GpuMat src1 = getInputMat(_src1, stream); + GpuMat src2 = getInputMat(_src2, stream); CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 ); CV_Assert( src1.size() == src2.size() ); - _dst.create(src1.size(), CV_32FC2); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src1.size(), CV_32FC2, stream); if (conjB) gridTransformBinary(globPtr(src1), globPtr(src2), globPtr(dst), comlex_mul_conj(), stream); else gridTransformBinary(globPtr(src1), globPtr(src2), globPtr(dst), comlex_mul(), stream); + + syncOutput(dst, _dst, stream); } void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream) { (void) flags; - GpuMat src1 = _src1.getGpuMat(); - GpuMat src2 = _src2.getGpuMat(); + GpuMat src1 = getInputMat(_src1, stream); + GpuMat src2 = getInputMat(_src2, stream); CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2); CV_Assert( src1.size() == src2.size() ); - _dst.create(src1.size(), CV_32FC2); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src1.size(), CV_32FC2, stream); if (conjB) { @@ -160,6 +163,8 @@ void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputAr op.scale = scale; gridTransformBinary(globPtr(src1), globPtr(src2), globPtr(dst), op, stream); } + + syncOutput(dst, _dst, stream); } #endif diff --git a/modules/cudaarithm/src/cuda/polar_cart.cu b/modules/cudaarithm/src/cuda/polar_cart.cu index 200b79c05..0a949b42e 100644 --- a/modules/cudaarithm/src/cuda/polar_cart.cu +++ b/modules/cudaarithm/src/cuda/polar_cart.cu @@ -50,55 +50,59 @@ #include "opencv2/cudaarithm.hpp" #include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; void cv::cuda::magnitude(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream) { - GpuMat x = _x.getGpuMat(); - GpuMat y = _y.getGpuMat(); + GpuMat x = getInputMat(_x, stream); + GpuMat y = getInputMat(_y, stream); - CV_DbgAssert( x.depth() == CV_32F ); - CV_DbgAssert( y.type() == x.type() && y.size() == x.size() ); + CV_Assert( x.depth() == CV_32F ); + CV_Assert( y.type() == x.type() && y.size() == x.size() ); - _dst.create(x.size(), CV_32FC1); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, x.size(), CV_32FC1, stream); GpuMat_ xc(x.reshape(1)); GpuMat_ yc(y.reshape(1)); GpuMat_ magc(dst.reshape(1)); gridTransformBinary(xc, yc, magc, magnitude_func(), stream); + + syncOutput(dst, _dst, stream); } void cv::cuda::magnitudeSqr(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream) { - GpuMat x = _x.getGpuMat(); - GpuMat y = _y.getGpuMat(); + GpuMat x = getInputMat(_x, stream); + GpuMat y = getInputMat(_y, stream); - CV_DbgAssert( x.depth() == CV_32F ); - CV_DbgAssert( y.type() == x.type() && 
y.size() == x.size() ); + CV_Assert( x.depth() == CV_32F ); + CV_Assert( y.type() == x.type() && y.size() == x.size() ); - _dst.create(x.size(), CV_32FC1); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, x.size(), CV_32FC1, stream); GpuMat_ xc(x.reshape(1)); GpuMat_ yc(y.reshape(1)); GpuMat_ magc(dst.reshape(1)); gridTransformBinary(xc, yc, magc, magnitude_sqr_func(), stream); + + syncOutput(dst, _dst, stream); } void cv::cuda::phase(InputArray _x, InputArray _y, OutputArray _dst, bool angleInDegrees, Stream& stream) { - GpuMat x = _x.getGpuMat(); - GpuMat y = _y.getGpuMat(); + GpuMat x = getInputMat(_x, stream); + GpuMat y = getInputMat(_y, stream); - CV_DbgAssert( x.depth() == CV_32F ); - CV_DbgAssert( y.type() == x.type() && y.size() == x.size() ); + CV_Assert( x.depth() == CV_32F ); + CV_Assert( y.type() == x.type() && y.size() == x.size() ); - _dst.create(x.size(), CV_32FC1); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, x.size(), CV_32FC1, stream); GpuMat_ xc(x.reshape(1)); GpuMat_ yc(y.reshape(1)); @@ -108,21 +112,20 @@ void cv::cuda::phase(InputArray _x, InputArray _y, OutputArray _dst, bool angleI gridTransformBinary(xc, yc, anglec, direction_func(), stream); else gridTransformBinary(xc, yc, anglec, direction_func(), stream); + + syncOutput(dst, _dst, stream); } void cv::cuda::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, OutputArray _angle, bool angleInDegrees, Stream& stream) { - GpuMat x = _x.getGpuMat(); - GpuMat y = _y.getGpuMat(); + GpuMat x = getInputMat(_x, stream); + GpuMat y = getInputMat(_y, stream); - CV_DbgAssert( x.depth() == CV_32F ); - CV_DbgAssert( y.type() == x.type() && y.size() == x.size() ); + CV_Assert( x.depth() == CV_32F ); + CV_Assert( y.type() == x.type() && y.size() == x.size() ); - _mag.create(x.size(), CV_32FC1); - GpuMat mag = _mag.getGpuMat(); - - _angle.create(x.size(), CV_32FC1); - GpuMat angle = _angle.getGpuMat(); + GpuMat mag = getOutputMat(_mag, x.size(), CV_32FC1, stream); + GpuMat angle = getOutputMat(_angle, x.size(), CV_32FC1, stream); GpuMat_ xc(x.reshape(1)); GpuMat_ yc(y.reshape(1)); @@ -147,6 +150,9 @@ void cv::cuda::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, Outpu binaryTupleAdapter<0, 1>(direction_func())), stream); } + + syncOutput(mag, _mag, stream); + syncOutput(angle, _angle, stream); } namespace @@ -173,17 +179,14 @@ namespace void cv::cuda::polarToCart(InputArray _mag, InputArray _angle, OutputArray _x, OutputArray _y, bool angleInDegrees, Stream& _stream) { - GpuMat mag = _mag.getGpuMat(); - GpuMat angle = _angle.getGpuMat(); + GpuMat mag = getInputMat(_mag, _stream); + GpuMat angle = getInputMat(_angle, _stream); - CV_DbgAssert( angle.depth() == CV_32F ); - CV_DbgAssert( mag.empty() || (mag.type() == angle.type() && mag.size() == angle.size()) ); + CV_Assert( angle.depth() == CV_32F ); + CV_Assert( mag.empty() || (mag.type() == angle.type() && mag.size() == angle.size()) ); - _x.create(angle.size(), CV_32FC1); - GpuMat x = _x.getGpuMat(); - - _y.create(angle.size(), CV_32FC1); - GpuMat y = _y.getGpuMat(); + GpuMat x = getOutputMat(_x, angle.size(), CV_32FC1, _stream); + GpuMat y = getOutputMat(_y, angle.size(), CV_32FC1, _stream); GpuMat_ xc(x.reshape(1)); GpuMat_ yc(y.reshape(1)); @@ -204,6 +207,9 @@ void cv::cuda::polarToCart(InputArray _mag, InputArray _angle, OutputArray _x, O CV_CUDEV_SAFE_CALL( cudaGetLastError() ); + syncOutput(x, _x, _stream); + syncOutput(y, _y, _stream); + if (stream == 0) CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); } diff 
--git a/modules/cudaarithm/src/cuda/reduce.cu b/modules/cudaarithm/src/cuda/reduce.cu index 2cb2dacc7..5fb90287a 100644 --- a/modules/cudaarithm/src/cuda/reduce.cu +++ b/modules/cudaarithm/src/cuda/reduce.cu @@ -50,7 +50,10 @@ #include "opencv2/cudaarithm.hpp" #include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; namespace @@ -125,7 +128,7 @@ namespace void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream) { - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); CV_Assert( src.channels() <= 4 ); CV_Assert( dim == 0 || dim == 1 ); @@ -134,8 +137,7 @@ void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, if (dtype < 0) dtype = src.depth(); - _dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels())); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, 1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()), stream); if (dim == 0) { @@ -292,6 +294,8 @@ void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, func(src, dst, reduceOp, stream); } + + syncOutput(dst, _dst, stream); } #endif diff --git a/modules/cudaarithm/src/cuda/split_merge.cu b/modules/cudaarithm/src/cuda/split_merge.cu index 13d6a349f..5b3af1077 100644 --- a/modules/cudaarithm/src/cuda/split_merge.cu +++ b/modules/cudaarithm/src/cuda/split_merge.cu @@ -50,7 +50,10 @@ #include "opencv2/cudaarithm.hpp" #include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; //////////////////////////////////////////////////////////////////////// @@ -92,20 +95,18 @@ namespace void mergeImpl(const GpuMat* src, size_t n, cv::OutputArray _dst, Stream& stream) { - CV_DbgAssert( src != 0 ); - CV_DbgAssert( n > 0 && n <= 4 ); + CV_Assert( src != 0 ); + CV_Assert( n > 0 && n <= 4 ); const int depth = src[0].depth(); const cv::Size size = src[0].size(); -#ifdef _DEBUG for (size_t i = 0; i < n; ++i) { CV_Assert( src[i].size() == size ); CV_Assert( src[i].depth() == depth ); CV_Assert( src[i].channels() == 1 ); } -#endif if (n == 1) { @@ -123,8 +124,7 @@ namespace const int channels = static_cast(n); - _dst.create(size, CV_MAKE_TYPE(depth, channels)); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, size, CV_MAKE_TYPE(depth, channels), stream); const func_t func = funcs[channels - 2][CV_ELEM_SIZE(depth) / 2]; @@ -132,6 +132,8 @@ namespace CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type"); func(src, dst, stream); + + syncOutput(dst, _dst, stream); } } } @@ -203,12 +205,12 @@ namespace {SplitFunc<4, uchar>::call, SplitFunc<4, ushort>::call, SplitFunc<4, int>::call, 0, SplitFunc<4, double>::call} }; - CV_DbgAssert( dst != 0 ); + CV_Assert( dst != 0 ); const int depth = src.depth(); const int channels = src.channels(); - CV_DbgAssert( channels <= 4 ); + CV_Assert( channels <= 4 ); if (channels == 0) return; @@ -233,13 +235,13 @@ namespace void cv::cuda::split(InputArray _src, GpuMat* dst, Stream& stream) { - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); splitImpl(src, dst, stream); } void cv::cuda::split(InputArray _src, std::vector& dst, Stream& stream) { - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); dst.resize(src.channels()); if (src.channels() > 0) splitImpl(src, 
&dst[0], stream); diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu index 21665cbe7..a5b8f07ce 100644 --- a/modules/cudaarithm/src/cuda/threshold.cu +++ b/modules/cudaarithm/src/cuda/threshold.cu @@ -52,6 +52,8 @@ #include "opencv2/cudev.hpp" #include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; namespace @@ -95,15 +97,14 @@ namespace double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& stream) { - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); const int depth = src.depth(); - CV_DbgAssert( src.channels() == 1 && depth <= CV_64F ); - CV_DbgAssert( type <= 4 /*THRESH_TOZERO_INV*/ ); + CV_Assert( src.channels() == 1 && depth <= CV_64F ); + CV_Assert( type <= 4 /*THRESH_TOZERO_INV*/ ); - _dst.create(src.size(), src.type()); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream); if (depth == CV_32F && type == 2 /*THRESH_TRUNC*/) { @@ -142,6 +143,8 @@ double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, dou funcs[depth](src, dst, thresh, maxVal, type, stream); } + syncOutput(dst, _dst, stream); + return thresh; } diff --git a/modules/cudaarithm/src/cuda/transpose.cu b/modules/cudaarithm/src/cuda/transpose.cu index aa8500442..bfe50bd34 100644 --- a/modules/cudaarithm/src/cuda/transpose.cu +++ b/modules/cudaarithm/src/cuda/transpose.cu @@ -52,18 +52,19 @@ #include "opencv2/cudev.hpp" #include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) { - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); const size_t elemSize = src.elemSize(); CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 ); - _dst.create( src.cols, src.rows, src.type() ); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream); if (elemSize == 1) { @@ -87,6 +88,8 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream) { gridTranspose(globPtr(src), globPtr(dst), stream); } + + syncOutput(dst, _dst, stream); } #endif diff --git a/modules/cudaarithm/src/element_operations.cpp b/modules/cudaarithm/src/element_operations.cpp index 795d7ffaa..f88119502 100644 --- a/modules/cudaarithm/src/element_operations.cpp +++ b/modules/cudaarithm/src/element_operations.cpp @@ -107,11 +107,11 @@ namespace GpuMat src1; if (!isScalar1) - src1 = _src1.getGpuMat(); + src1 = getInputMat(_src1, stream); GpuMat src2; if (!isScalar2) - src2 = _src2.getGpuMat(); + src2 = getInputMat(_src2, stream); Mat scalar; if (isScalar1) @@ -126,7 +126,7 @@ namespace scalar.convertTo(Mat_(scalar.rows, scalar.cols, &val[0]), CV_64F); } - GpuMat mask = _mask.getGpuMat(); + GpuMat mask = getInputMat(_mask, stream); const int sdepth = src1.empty() ? src2.depth() : src1.depth(); const int cn = src1.empty() ? 
src2.channels() : src1.channels(); @@ -147,8 +147,7 @@ namespace CV_Error(Error::StsUnsupportedFormat, "The device doesn't support double"); } - _dst.create(size, CV_MAKE_TYPE(ddepth, cn)); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, size, CV_MAKE_TYPE(ddepth, cn), stream); if (isScalar1) mat_scalar_func(src2, val, true, dst, mask, scale, stream, op); @@ -156,6 +155,8 @@ namespace mat_scalar_func(src1, val, false, dst, mask, scale, stream, op); else mat_mat_func(src1, src2, dst, mask, scale, stream, op); + + syncOutput(dst, _dst, stream); } } @@ -196,27 +197,29 @@ void cv::cuda::multiply(InputArray _src1, InputArray _src2, OutputArray _dst, do { if (_src1.type() == CV_8UC4 && _src2.type() == CV_32FC1) { - GpuMat src1 = _src1.getGpuMat(); - GpuMat src2 = _src2.getGpuMat(); + GpuMat src1 = getInputMat(_src1, stream); + GpuMat src2 = getInputMat(_src2, stream); CV_Assert( src1.size() == src2.size() ); - _dst.create(src1.size(), src1.type()); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src1.size(), src1.type(), stream); mulMat_8uc4_32f(src1, src2, dst, stream); + + syncOutput(dst, _dst, stream); } else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1) { - GpuMat src1 = _src1.getGpuMat(); - GpuMat src2 = _src2.getGpuMat(); + GpuMat src1 = getInputMat(_src1, stream); + GpuMat src2 = getInputMat(_src2, stream); CV_Assert( src1.size() == src2.size() ); - _dst.create(src1.size(), src1.type()); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src1.size(), src1.type(), stream); mulMat_16sc4_32f(src1, src2, dst, stream); + + syncOutput(dst, _dst, stream); } else { @@ -237,27 +240,29 @@ void cv::cuda::divide(InputArray _src1, InputArray _src2, OutputArray _dst, doub { if (_src1.type() == CV_8UC4 && _src2.type() == CV_32FC1) { - GpuMat src1 = _src1.getGpuMat(); - GpuMat src2 = _src2.getGpuMat(); + GpuMat src1 = getInputMat(_src1, stream); + GpuMat src2 = getInputMat(_src2, stream); CV_Assert( src1.size() == src2.size() ); - _dst.create(src1.size(), src1.type()); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src1.size(), src1.type(), stream); divMat_8uc4_32f(src1, src2, dst, stream); + + syncOutput(dst, _dst, stream); } else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1) { - GpuMat src1 = _src1.getGpuMat(); - GpuMat src2 = _src2.getGpuMat(); + GpuMat src1 = getInputMat(_src1, stream); + GpuMat src2 = getInputMat(_src2, stream); CV_Assert( src1.size() == src2.size() ); - _dst.create(src1.size(), src1.type()); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src1.size(), src1.type(), stream); divMat_16sc4_32f(src1, src2, dst, stream); + + syncOutput(dst, _dst, stream); } else { @@ -389,15 +394,16 @@ void cv::cuda::rshift(InputArray _src, Scalar_ val, OutputArray _dst, Strea {NppShift::call, 0, NppShift::call, NppShift::call}, }; - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); CV_Assert( src.depth() < CV_32F ); CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 ); - _dst.create(src.size(), src.type()); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream); funcs[src.depth()][src.channels() - 1](src, val, dst, StreamAccessor::getStream(stream)); + + syncOutput(dst, _dst, stream); } void cv::cuda::lshift(InputArray _src, Scalar_ val, OutputArray _dst, Stream& stream) @@ -412,15 +418,16 @@ void cv::cuda::lshift(InputArray _src, Scalar_ val, OutputArray _dst, Strea {NppShift::call, 0, 
NppShift::call, NppShift::call}, }; - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); CV_Assert( src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S ); CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 ); - _dst.create(src.size(), src.type()); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream); funcs[src.depth()][src.channels() - 1](src, val, dst, StreamAccessor::getStream(stream)); + + syncOutput(dst, _dst, stream); } ////////////////////////////////////////////////////////////////////////////// @@ -475,22 +482,24 @@ namespace void cv::cuda::magnitude(InputArray _src, OutputArray _dst, Stream& stream) { - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); - _dst.create(src.size(), CV_32FC1); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src.size(), CV_32FC1, stream); npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream)); + + syncOutput(dst, _dst, stream); } void cv::cuda::magnitudeSqr(InputArray _src, OutputArray _dst, Stream& stream) { - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); - _dst.create(src.size(), CV_32FC1); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src.size(), CV_32FC1, stream); npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream)); + + syncOutput(dst, _dst, stream); } #endif From a4e598f474b5986d0be9ba0b0fafcbf5ee82fe02 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 24 Dec 2014 13:39:37 +0300 Subject: [PATCH 4/6] use new BufferPool class for some cudaarithm routines --- modules/cudaarithm/src/arithm.cpp | 15 ++++++++++---- modules/cudaarithm/src/cuda/integral.cu | 27 ++++++++++++++++--------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/modules/cudaarithm/src/arithm.cpp b/modules/cudaarithm/src/arithm.cpp index b2107dd1f..08de4e428 100644 --- a/modules/cudaarithm/src/arithm.cpp +++ b/modules/cudaarithm/src/arithm.cpp @@ -315,13 +315,20 @@ void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, // We don't support real-to-real transform CV_Assert( is_complex_input || is_complex_output ); - GpuMat src_cont = src; - // Make sure here we work with the continuous input, // as CUFFT can't handle gaps - createContinuous(src.rows, src.cols, src.type(), src_cont); - if (src_cont.data != src.data) + GpuMat src_cont; + if (src.isContinuous()) + { + src_cont = src; + } + else + { + BufferPool pool(stream); + src_cont.allocator = pool.getAllocator(); + createContinuous(src.rows, src.cols, src.type(), src_cont); src.copyTo(src_cont, stream); + } Size dft_size_opt = dft_size; if (is_1d_input && !is_row_dft) diff --git a/modules/cudaarithm/src/cuda/integral.cu b/modules/cudaarithm/src/cuda/integral.cu index db554eb30..4a70ab0de 100644 --- a/modules/cudaarithm/src/cuda/integral.cu +++ b/modules/cudaarithm/src/cuda/integral.cu @@ -50,51 +50,58 @@ #include "opencv2/cudaarithm.hpp" #include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; //////////////////////////////////////////////////////////////////////// // integral -void cv::cuda::integral(InputArray _src, OutputArray _dst, GpuMat& buffer, Stream& stream) +void cv::cuda::integral(InputArray _src, OutputArray _dst, Stream& stream) { - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); CV_Assert( 
src.type() == CV_8UC1 ); - GpuMat_& res = (GpuMat_&) buffer; + BufferPool pool(stream); + GpuMat_ res(src.size(), pool.getAllocator()); gridIntegral(globPtr(src), res, stream); - _dst.create(src.rows + 1, src.cols + 1, CV_32SC1); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src.rows + 1, src.cols + 1, CV_32SC1, stream); dst.setTo(Scalar::all(0), stream); GpuMat inner = dst(Rect(1, 1, src.cols, src.rows)); res.copyTo(inner, stream); + + syncOutput(dst, _dst, stream); } ////////////////////////////////////////////////////////////////////////////// // sqrIntegral -void cv::cuda::sqrIntegral(InputArray _src, OutputArray _dst, GpuMat& buf, Stream& stream) +void cv::cuda::sqrIntegral(InputArray _src, OutputArray _dst, Stream& stream) { - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); CV_Assert( src.type() == CV_8UC1 ); - GpuMat_& res = (GpuMat_&) buf; + BufferPool pool(Stream::Null()); + GpuMat_ res(pool.getBuffer(src.size(), CV_64FC1)); gridIntegral(sqr_(cvt_(globPtr(src))), res, stream); - _dst.create(src.rows + 1, src.cols + 1, CV_64FC1); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src.rows + 1, src.cols + 1, CV_64FC1, stream); dst.setTo(Scalar::all(0), stream); GpuMat inner = dst(Rect(1, 1, src.cols, src.rows)); res.copyTo(inner, stream); + + syncOutput(dst, _dst, stream); } #endif From cd0e95de165d89359b9641e6735d9b1b7216b698 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 24 Dec 2014 13:40:33 +0300 Subject: [PATCH 5/6] refactor cudaarithm reductions: * remove overloads with explicit buffer, now BufferPool is used * added async versions for all reduce functions --- modules/cuda/src/cascadeclassifier.cpp | 3 +- .../cudaarithm/include/opencv2/cudaarithm.hpp | 193 ++---------- modules/cudaarithm/perf/perf_reductions.cpp | 34 +- modules/cudaarithm/src/cuda/countnonzero.cu | 59 ++-- modules/cudaarithm/src/cuda/minmax.cu | 138 +++++++-- modules/cudaarithm/src/cuda/minmaxloc.cu | 130 +++++--- modules/cudaarithm/src/cuda/norm.cu | 126 ++++++-- modules/cudaarithm/src/cuda/normalize.cu | 290 ++++++++++++++++++ modules/cudaarithm/src/cuda/sum.cu | 174 +++++++---- modules/cudaarithm/src/reductions.cpp | 200 ++++++------ modules/cudaarithm/test/test_reductions.cpp | 205 ++++++++++++- modules/cudabgsegm/src/fgd.cpp | 8 +- modules/cudafilters/src/filtering.cpp | 2 +- modules/cudaimgproc/src/gftt.cpp | 3 +- modules/cudaimgproc/src/match_template.cpp | 23 +- samples/gpu/performance/tests.cpp | 6 +- 16 files changed, 1075 insertions(+), 519 deletions(-) create mode 100644 modules/cudaarithm/src/cuda/normalize.cu diff --git a/modules/cuda/src/cascadeclassifier.cpp b/modules/cuda/src/cascadeclassifier.cpp index c4e987015..259712b89 100644 --- a/modules/cuda/src/cascadeclassifier.cpp +++ b/modules/cuda/src/cascadeclassifier.cpp @@ -454,11 +454,10 @@ public: // create sutable matrix headers GpuMat src = resuzeBuffer(cv::Rect(0, 0, level.sFrame.width, level.sFrame.height)); GpuMat sint = integral(cv::Rect(prev, 0, level.sFrame.width + 1, level.sFrame.height + 1)); - GpuMat buff = integralBuffer; // generate integral for scale cuda::resize(image, src, level.sFrame, 0, 0, cv::INTER_LINEAR); - cuda::integral(src, sint, buff); + cuda::integral(src, sint); // calculate job int totalWidth = level.workArea.width / step; diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp index be095b92e..6e475db98 100644 --- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp +++ 
b/modules/cudaarithm/include/opencv2/cudaarithm.hpp @@ -524,116 +524,53 @@ CV_EXPORTS void copyMakeBorder(InputArray src, OutputArray dst, int top, int bot @param src1 Source matrix. Any matrices except 64F are supported. @param normType Norm type. NORM_L1 , NORM_L2 , and NORM_INF are supported for now. @param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type. -@param buf Optional buffer to avoid extra memory allocations. It is resized automatically. @sa norm */ -CV_EXPORTS double norm(InputArray src1, int normType, InputArray mask, GpuMat& buf); -/** @overload -uses new buffer, no mask -*/ -static inline double norm(InputArray src, int normType) -{ - GpuMat buf; - return norm(src, normType, GpuMat(), buf); -} -/** @overload -no mask -*/ -static inline double norm(InputArray src, int normType, GpuMat& buf) -{ - return norm(src, normType, GpuMat(), buf); -} +CV_EXPORTS double norm(InputArray src1, int normType, InputArray mask = noArray()); +/** @overload */ +CV_EXPORTS void calcNorm(InputArray src, OutputArray dst, int normType, InputArray mask = noArray(), Stream& stream = Stream::Null()); /** @brief Returns the difference of two matrices. @param src1 Source matrix. Any matrices except 64F are supported. @param src2 Second source matrix (if any) with the same size and type as src1. @param normType Norm type. NORM_L1 , NORM_L2 , and NORM_INF are supported for now. -@param buf Optional buffer to avoid extra memory allocations. It is resized automatically. @sa norm */ -CV_EXPORTS double norm(InputArray src1, InputArray src2, GpuMat& buf, int normType=NORM_L2); -/** @overload -uses new buffer -*/ -static inline double norm(InputArray src1, InputArray src2, int normType=NORM_L2) -{ - GpuMat buf; - return norm(src1, src2, buf, normType); -} +CV_EXPORTS double norm(InputArray src1, InputArray src2, int normType=NORM_L2); +/** @overload */ +CV_EXPORTS void calcNormDiff(InputArray src1, InputArray src2, OutputArray dst, int normType=NORM_L2, Stream& stream = Stream::Null()); /** @brief Returns the sum of matrix elements. @param src Source image of any depth except for CV_64F . @param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type. -@param buf Optional buffer to avoid extra memory allocations. It is resized automatically. @sa sum */ -CV_EXPORTS Scalar sum(InputArray src, InputArray mask, GpuMat& buf); -/** @overload -uses new buffer, no mask -*/ -static inline Scalar sum(InputArray src) -{ - GpuMat buf; - return sum(src, GpuMat(), buf); -} -/** @overload -no mask -*/ -static inline Scalar sum(InputArray src, GpuMat& buf) -{ - return sum(src, GpuMat(), buf); -} +CV_EXPORTS Scalar sum(InputArray src, InputArray mask = noArray()); +/** @overload */ +CV_EXPORTS void calcSum(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null()); /** @brief Returns the sum of absolute values for matrix elements. @param src Source image of any depth except for CV_64F . @param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type. -@param buf Optional buffer to avoid extra memory allocations. It is resized automatically. 
*/ -CV_EXPORTS Scalar absSum(InputArray src, InputArray mask, GpuMat& buf); -/** @overload -uses new buffer, no mask -*/ -static inline Scalar absSum(InputArray src) -{ - GpuMat buf; - return absSum(src, GpuMat(), buf); -} -/** @overload -no mask -*/ -static inline Scalar absSum(InputArray src, GpuMat& buf) -{ - return absSum(src, GpuMat(), buf); -} +CV_EXPORTS Scalar absSum(InputArray src, InputArray mask = noArray()); +/** @overload */ +CV_EXPORTS void calcAbsSum(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null()); /** @brief Returns the squared sum of matrix elements. @param src Source image of any depth except for CV_64F . @param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type. -@param buf Optional buffer to avoid extra memory allocations. It is resized automatically. */ -CV_EXPORTS Scalar sqrSum(InputArray src, InputArray mask, GpuMat& buf); -/** @overload -uses new buffer, no mask -*/ -static inline Scalar sqrSum(InputArray src) -{ - GpuMat buf; - return sqrSum(src, GpuMat(), buf); -} -/** @overload -no mask -*/ -static inline Scalar sqrSum(InputArray src, GpuMat& buf) -{ - return sqrSum(src, GpuMat(), buf); -} +CV_EXPORTS Scalar sqrSum(InputArray src, InputArray mask = noArray()); +/** @overload */ +CV_EXPORTS void calcSqrSum(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null()); /** @brief Finds global minimum and maximum matrix elements and returns their values. @@ -641,21 +578,14 @@ static inline Scalar sqrSum(InputArray src, GpuMat& buf) @param minVal Pointer to the returned minimum value. Use NULL if not required. @param maxVal Pointer to the returned maximum value. Use NULL if not required. @param mask Optional mask to select a sub-matrix. -@param buf Optional buffer to avoid extra memory allocations. It is resized automatically. The function does not work with CV_64F images on GPUs with the compute capability \< 1.3. @sa minMaxLoc */ -CV_EXPORTS void minMax(InputArray src, double* minVal, double* maxVal, InputArray mask, GpuMat& buf); -/** @overload -uses new buffer -*/ -static inline void minMax(InputArray src, double* minVal, double* maxVal=0, InputArray mask=noArray()) -{ - GpuMat buf; - minMax(src, minVal, maxVal, mask, buf); -} +CV_EXPORTS void minMax(InputArray src, double* minVal, double* maxVal, InputArray mask = noArray()); +/** @overload */ +CV_EXPORTS void findMinMax(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null()); /** @brief Finds global minimum and maximum matrix elements and returns their values with locations. @@ -665,44 +595,28 @@ static inline void minMax(InputArray src, double* minVal, double* maxVal=0, Inpu @param minLoc Pointer to the returned minimum location. Use NULL if not required. @param maxLoc Pointer to the returned maximum location. Use NULL if not required. @param mask Optional mask to select a sub-matrix. -@param valbuf Optional values buffer to avoid extra memory allocations. It is resized -automatically. -@param locbuf Optional locations buffer to avoid extra memory allocations. It is resized -automatically. + The function does not work with CV_64F images on GPU with the compute capability \< 1.3. 
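The findMinMaxLoc overload declared below queues the reduction on a stream and writes the
results into small device matrices instead of returning them; a sketch (illustrative only:
d_src and stream are assumed to exist, and the exact result layout is an implementation
detail not fixed by the declaration):
@code
cv::cuda::GpuMat d_vals, d_locs;
cv::cuda::findMinMaxLoc(d_src, d_vals, d_locs, cv::noArray(), stream);
// ... enqueue more work on the same stream ...
cv::Mat h_vals, h_locs;
d_vals.download(h_vals, stream);
d_locs.download(h_locs, stream);
stream.waitForCompletion();
@endcode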
@sa minMaxLoc */ CV_EXPORTS void minMaxLoc(InputArray src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, - InputArray mask, GpuMat& valbuf, GpuMat& locbuf); -/** @overload -uses new buffer -*/ -static inline void minMaxLoc(InputArray src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, - InputArray mask=noArray()) -{ - GpuMat valBuf, locBuf; - minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf); -} + InputArray mask = noArray()); +/** @overload */ +CV_EXPORTS void findMinMaxLoc(InputArray src, OutputArray minMaxVals, OutputArray loc, + InputArray mask = noArray(), Stream& stream = Stream::Null()); /** @brief Counts non-zero matrix elements. @param src Single-channel source image. -@param buf Optional buffer to avoid extra memory allocations. It is resized automatically. The function does not work with CV_64F images on GPUs with the compute capability \< 1.3. @sa countNonZero */ -CV_EXPORTS int countNonZero(InputArray src, GpuMat& buf); -/** @overload -uses new buffer -*/ -static inline int countNonZero(const GpuMat& src) -{ - GpuMat buf; - return countNonZero(src, buf); -} +CV_EXPORTS int countNonZero(InputArray src); +/** @overload */ +CV_EXPORTS void countNonZero(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); /** @brief Reduces a matrix to a vector. @@ -737,19 +651,12 @@ CV_EXPORTS void reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp, i @param mtx Source matrix. CV_8UC1 matrices are supported for now. @param mean Mean value. @param stddev Standard deviation value. -@param buf Optional buffer to avoid extra memory allocations. It is resized automatically. @sa meanStdDev */ -CV_EXPORTS void meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev, GpuMat& buf); -/** @overload -uses new buffer -*/ -static inline void meanStdDev(InputArray src, Scalar& mean, Scalar& stddev) -{ - GpuMat buf; - meanStdDev(src, mean, stddev, buf); -} +CV_EXPORTS void meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev); +/** @overload */ +CV_EXPORTS void meanStdDev(InputArray mtx, OutputArray dst, Stream& stream = Stream::Null()); /** @brief Computes a standard deviation of integral images. @@ -773,64 +680,32 @@ normalization. @param dtype When negative, the output array has the same type as src; otherwise, it has the same number of channels as src and the depth =CV_MAT_DEPTH(dtype). @param mask Optional operation mask. -@param norm_buf Optional buffer to avoid extra memory allocations. It is resized automatically. -@param cvt_buf Optional buffer to avoid extra memory allocations. It is resized automatically. +@param stream Stream for the asynchronous version. @sa normalize */ CV_EXPORTS void normalize(InputArray src, OutputArray dst, double alpha, double beta, - int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf); -/** @overload -uses new buffers -*/ -static inline void normalize(InputArray src, OutputArray dst, double alpha = 1, double beta = 0, - int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray()) -{ - GpuMat norm_buf; - GpuMat cvt_buf; - normalize(src, dst, alpha, beta, norm_type, dtype, mask, norm_buf, cvt_buf); -} + int norm_type, int dtype, InputArray mask = noArray(), + Stream& stream = Stream::Null()); /** @brief Computes an integral image. @param src Source image. Only CV_8UC1 images are supported for now. @param sum Integral image containing 32-bit unsigned integer values packed into CV_32SC1 . -@param buffer Optional buffer to avoid extra memory allocations. 
It is resized automatically. @param stream Stream for the asynchronous version. @sa integral */ -CV_EXPORTS void integral(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null()); -static inline void integralBuffered(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null()) -{ - integral(src, sum, buffer, stream); -} -/** @overload -uses new buffer -*/ -static inline void integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null()) -{ - GpuMat buffer; - integral(src, sum, buffer, stream); -} +CV_EXPORTS void integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null()); /** @brief Computes a squared integral image. @param src Source image. Only CV_8UC1 images are supported for now. @param sqsum Squared integral image containing 64-bit unsigned integer values packed into CV_64FC1 . -@param buf Optional buffer to avoid extra memory allocations. It is resized automatically. @param stream Stream for the asynchronous version. */ -CV_EXPORTS void sqrIntegral(InputArray src, OutputArray sqsum, GpuMat& buf, Stream& stream = Stream::Null()); -/** @overload -uses new buffer -*/ -static inline void sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null()) -{ - GpuMat buffer; - sqrIntegral(src, sqsum, buffer, stream); -} +CV_EXPORTS void sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null()); //! @} cudaarithm_reduce diff --git a/modules/cudaarithm/perf/perf_reductions.cpp b/modules/cudaarithm/perf/perf_reductions.cpp index 470df48a3..78699c0a7 100644 --- a/modules/cudaarithm/perf/perf_reductions.cpp +++ b/modules/cudaarithm/perf/perf_reductions.cpp @@ -108,10 +108,9 @@ PERF_TEST_P(Sz_Norm, NormDiff, { const cv::cuda::GpuMat d_src1(src1); const cv::cuda::GpuMat d_src2(src2); - cv::cuda::GpuMat d_buf; double gpu_dst; - TEST_CYCLE() gpu_dst = cv::cuda::norm(d_src1, d_src2, d_buf, normType); + TEST_CYCLE() gpu_dst = cv::cuda::norm(d_src1, d_src2, normType); SANITY_CHECK(gpu_dst); @@ -146,10 +145,9 @@ PERF_TEST_P(Sz_Depth_Cn, Sum, if (PERF_RUN_CUDA()) { const cv::cuda::GpuMat d_src(src); - cv::cuda::GpuMat d_buf; cv::Scalar gpu_dst; - TEST_CYCLE() gpu_dst = cv::cuda::sum(d_src, d_buf); + TEST_CYCLE() gpu_dst = cv::cuda::sum(d_src); SANITY_CHECK(gpu_dst, 1e-5, ERROR_RELATIVE); } @@ -183,10 +181,9 @@ PERF_TEST_P(Sz_Depth_Cn, SumAbs, if (PERF_RUN_CUDA()) { const cv::cuda::GpuMat d_src(src); - cv::cuda::GpuMat d_buf; cv::Scalar gpu_dst; - TEST_CYCLE() gpu_dst = cv::cuda::absSum(d_src, d_buf); + TEST_CYCLE() gpu_dst = cv::cuda::absSum(d_src); SANITY_CHECK(gpu_dst, 1e-6, ERROR_RELATIVE); } @@ -216,10 +213,9 @@ PERF_TEST_P(Sz_Depth_Cn, SumSqr, if (PERF_RUN_CUDA()) { const cv::cuda::GpuMat d_src(src); - cv::cuda::GpuMat d_buf; cv::Scalar gpu_dst; - TEST_CYCLE() gpu_dst = cv::cuda::sqrSum(d_src, d_buf); + TEST_CYCLE() gpu_dst = cv::cuda::sqrSum(d_src); SANITY_CHECK(gpu_dst, 1e-6, ERROR_RELATIVE); } @@ -248,10 +244,9 @@ PERF_TEST_P(Sz_Depth, MinMax, if (PERF_RUN_CUDA()) { const cv::cuda::GpuMat d_src(src); - cv::cuda::GpuMat d_buf; double gpu_minVal, gpu_maxVal; - TEST_CYCLE() cv::cuda::minMax(d_src, &gpu_minVal, &gpu_maxVal, cv::cuda::GpuMat(), d_buf); + TEST_CYCLE() cv::cuda::minMax(d_src, &gpu_minVal, &gpu_maxVal, cv::cuda::GpuMat()); SANITY_CHECK(gpu_minVal, 1e-10); SANITY_CHECK(gpu_maxVal, 1e-10); @@ -286,11 +281,10 @@ PERF_TEST_P(Sz_Depth, MinMaxLoc, if (PERF_RUN_CUDA()) { const cv::cuda::GpuMat d_src(src); - cv::cuda::GpuMat d_valbuf, d_locbuf; double gpu_minVal, gpu_maxVal; cv::Point 
gpu_minLoc, gpu_maxLoc; - TEST_CYCLE() cv::cuda::minMaxLoc(d_src, &gpu_minVal, &gpu_maxVal, &gpu_minLoc, &gpu_maxLoc, cv::cuda::GpuMat(), d_valbuf, d_locbuf); + TEST_CYCLE() cv::cuda::minMaxLoc(d_src, &gpu_minVal, &gpu_maxVal, &gpu_minLoc, &gpu_maxLoc); SANITY_CHECK(gpu_minVal, 1e-10); SANITY_CHECK(gpu_maxVal, 1e-10); @@ -323,10 +317,9 @@ PERF_TEST_P(Sz_Depth, CountNonZero, if (PERF_RUN_CUDA()) { const cv::cuda::GpuMat d_src(src); - cv::cuda::GpuMat d_buf; int gpu_dst = 0; - TEST_CYCLE() gpu_dst = cv::cuda::countNonZero(d_src, d_buf); + TEST_CYCLE() gpu_dst = cv::cuda::countNonZero(d_src); SANITY_CHECK(gpu_dst); } @@ -414,9 +407,8 @@ PERF_TEST_P(Sz_Depth_NormType, Normalize, { const cv::cuda::GpuMat d_src(src); cv::cuda::GpuMat dst; - cv::cuda::GpuMat d_norm_buf, d_cvt_buf; - TEST_CYCLE() cv::cuda::normalize(d_src, dst, alpha, beta, norm_type, type, cv::cuda::GpuMat(), d_norm_buf, d_cvt_buf); + TEST_CYCLE() cv::cuda::normalize(d_src, dst, alpha, beta, norm_type, type, cv::cuda::GpuMat()); CUDA_SANITY_CHECK(dst, 1e-6); } @@ -445,11 +437,10 @@ PERF_TEST_P(Sz, MeanStdDev, if (PERF_RUN_CUDA()) { const cv::cuda::GpuMat d_src(src); - cv::cuda::GpuMat d_buf; cv::Scalar gpu_mean; cv::Scalar gpu_stddev; - TEST_CYCLE() cv::cuda::meanStdDev(d_src, gpu_mean, gpu_stddev, d_buf); + TEST_CYCLE() cv::cuda::meanStdDev(d_src, gpu_mean, gpu_stddev); SANITY_CHECK(gpu_mean); SANITY_CHECK(gpu_stddev); @@ -481,9 +472,8 @@ PERF_TEST_P(Sz, Integral, { const cv::cuda::GpuMat d_src(src); cv::cuda::GpuMat dst; - cv::cuda::GpuMat d_buf; - TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf); + TEST_CYCLE() cv::cuda::integral(d_src, dst); CUDA_SANITY_CHECK(dst); } @@ -511,9 +501,9 @@ PERF_TEST_P(Sz, IntegralSqr, if (PERF_RUN_CUDA()) { const cv::cuda::GpuMat d_src(src); - cv::cuda::GpuMat dst, buf; + cv::cuda::GpuMat dst; - TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf); + TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst); CUDA_SANITY_CHECK(dst); } diff --git a/modules/cudaarithm/src/cuda/countnonzero.cu b/modules/cudaarithm/src/cuda/countnonzero.cu index 5de260909..fb7324660 100644 --- a/modules/cudaarithm/src/cuda/countnonzero.cu +++ b/modules/cudaarithm/src/cuda/countnonzero.cu @@ -50,47 +50,64 @@ #include "opencv2/cudaarithm.hpp" #include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; namespace { - template - int countNonZeroImpl(const GpuMat& _src, GpuMat& _buf) + template + void countNonZeroImpl(const GpuMat& _src, GpuMat& _dst, Stream& stream) { const GpuMat_& src = (const GpuMat_&) _src; - GpuMat_& buf = (GpuMat_&) _buf; + GpuMat_& dst = (GpuMat_&) _dst; - gridCountNonZero(src, buf); - - int data; - buf.download(cv::Mat(1, 1, buf.type(), &data)); - - return data; + gridCountNonZero(src, dst, stream); } } -int cv::cuda::countNonZero(InputArray _src, GpuMat& buf) +void cv::cuda::countNonZero(InputArray _src, OutputArray _dst, Stream& stream) { - typedef int (*func_t)(const GpuMat& _src, GpuMat& _buf); + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, Stream& stream); static const func_t funcs[] = { - countNonZeroImpl, - countNonZeroImpl, - countNonZeroImpl, - countNonZeroImpl, - countNonZeroImpl, - countNonZeroImpl, - countNonZeroImpl + countNonZeroImpl, + countNonZeroImpl, + countNonZeroImpl, + countNonZeroImpl, + countNonZeroImpl, + countNonZeroImpl, + countNonZeroImpl, }; - GpuMat src = _src.getGpuMat(); + GpuMat src = getInputMat(_src, stream); + CV_Assert( src.depth() <= CV_64F ); CV_Assert( src.channels() == 1 ); 
- const func_t func = funcs[src.depth()]; + GpuMat dst = getOutputMat(_dst, 1, 1, CV_32SC1, stream); - return func(src, buf); + const func_t func = funcs[src.depth()]; + func(src, dst, stream); + + syncOutput(dst, _dst, stream); +} + +int cv::cuda::countNonZero(InputArray _src) +{ + Stream& stream = Stream::Null(); + + BufferPool pool(stream); + GpuMat buf = pool.getBuffer(1, 1, CV_32SC1); + + countNonZero(_src, buf, stream); + + int data; + buf.download(Mat(1, 1, CV_32SC1, &data)); + + return data; } #endif diff --git a/modules/cudaarithm/src/cuda/minmax.cu b/modules/cudaarithm/src/cuda/minmax.cu index 084bed870..517427073 100644 --- a/modules/cudaarithm/src/cuda/minmax.cu +++ b/modules/cudaarithm/src/cuda/minmax.cu @@ -50,62 +50,140 @@ #include "opencv2/cudaarithm.hpp" #include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; namespace { - template - void minMaxImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf, double* minVal, double* maxVal) + template + void minMaxImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream) { - typedef typename SelectIf< - TypesEquals::value, - double, - typename SelectIf::value, float, int>::type - >::type work_type; - const GpuMat_& src = (const GpuMat_&) _src; - GpuMat_& buf = (GpuMat_&) _buf; + GpuMat_& dst = (GpuMat_&) _dst; if (mask.empty()) - gridFindMinMaxVal(src, buf); + gridFindMinMaxVal(src, dst, stream); else - gridFindMinMaxVal(src, buf, globPtr(mask)); + gridFindMinMaxVal(src, dst, globPtr(mask), stream); + } - work_type data[2]; - buf.download(cv::Mat(1, 2, buf.type(), data)); + template + void minMaxImpl(const GpuMat& src, const GpuMat& mask, double* minVal, double* maxVal) + { + BufferPool pool(Stream::Null()); + GpuMat buf(pool.getBuffer(1, 2, DataType::type)); - if (minVal) - *minVal = data[0]; + minMaxImpl(src, mask, buf, Stream::Null()); + + R data[2]; + buf.download(Mat(1, 2, buf.type(), data)); - if (maxVal) - *maxVal = data[1]; } } -void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask, GpuMat& buf) +void cv::cuda::findMinMax(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream) { - typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf, double* minVal, double* maxVal); + typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream); static const func_t funcs[] = { - minMaxImpl, - minMaxImpl, - minMaxImpl, - minMaxImpl, - minMaxImpl, - minMaxImpl, - minMaxImpl + minMaxImpl, + minMaxImpl, + minMaxImpl, + minMaxImpl, + minMaxImpl, + minMaxImpl, + minMaxImpl }; - GpuMat src = _src.getGpuMat(); - GpuMat mask = _mask.getGpuMat(); + const GpuMat src = getInputMat(_src, stream); + const GpuMat mask = getInputMat(_mask, stream); CV_Assert( src.channels() == 1 ); - CV_DbgAssert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) ); + CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) ); + + const int src_depth = src.depth(); + const int dst_depth = src_depth < CV_32F ? 
CV_32S : src_depth; + + GpuMat dst = getOutputMat(_dst, 1, 2, dst_depth, stream); const func_t func = funcs[src.depth()]; + func(src, mask, dst, stream); - func(src, mask, buf, minVal, maxVal); + syncOutput(dst, _dst, stream); +} + +void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask) +{ + Stream& stream = Stream::Null(); + + HostMem dst; + findMinMax(_src, dst, _mask, stream); + + stream.waitForCompletion(); + + double vals[2]; + dst.createMatHeader().convertTo(Mat(1, 2, CV_64FC1, &vals[0]), CV_64F); + + if (minVal) + *minVal = vals[0]; + + if (maxVal) + *maxVal = vals[1]; +} + +namespace cv { namespace cuda { namespace internal { + +void findMaxAbs(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream); + +}}} + +namespace +{ + template + void findMaxAbsImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream) + { + const GpuMat_& src = (const GpuMat_&) _src; + GpuMat_& dst = (GpuMat_&) _dst; + + if (mask.empty()) + gridFindMaxVal(abs_(src), dst, stream); + else + gridFindMaxVal(abs_(src), dst, globPtr(mask), stream); + } +} + +void cv::cuda::internal::findMaxAbs(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream); + static const func_t funcs[] = + { + findMaxAbsImpl, + findMaxAbsImpl, + findMaxAbsImpl, + findMaxAbsImpl, + findMaxAbsImpl, + findMaxAbsImpl, + findMaxAbsImpl + }; + + const GpuMat src = getInputMat(_src, stream); + const GpuMat mask = getInputMat(_mask, stream); + + CV_Assert( src.channels() == 1 ); + CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) ); + + const int src_depth = src.depth(); + const int dst_depth = src_depth < CV_32F ? 
CV_32S : src_depth; + + GpuMat dst = getOutputMat(_dst, 1, 1, dst_depth, stream); + + const func_t func = funcs[src.depth()]; + func(src, mask, dst, stream); + + syncOutput(dst, _dst, stream); } #endif diff --git a/modules/cudaarithm/src/cuda/minmaxloc.cu b/modules/cudaarithm/src/cuda/minmaxloc.cu index 6f8cc53d6..b7c5ec872 100644 --- a/modules/cudaarithm/src/cuda/minmaxloc.cu +++ b/modules/cudaarithm/src/cuda/minmaxloc.cu @@ -50,78 +50,110 @@ #include "opencv2/cudaarithm.hpp" #include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; namespace { - template - void minMaxLocImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc) + template + void minMaxLocImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, Stream& stream) { - typedef typename SelectIf< - TypesEquals::value, - double, - typename SelectIf::value, float, int>::type - >::type work_type; - const GpuMat_& src = (const GpuMat_&) _src; - GpuMat_& valBuf = (GpuMat_&) _valBuf; + GpuMat_& valBuf = (GpuMat_&) _valBuf; GpuMat_& locBuf = (GpuMat_&) _locBuf; if (mask.empty()) - gridMinMaxLoc(src, valBuf, locBuf); + gridMinMaxLoc(src, valBuf, locBuf, stream); else - gridMinMaxLoc(src, valBuf, locBuf, globPtr(mask)); - - cv::Mat_ h_valBuf; - cv::Mat_ h_locBuf; - - valBuf.download(h_valBuf); - locBuf.download(h_locBuf); - - if (minVal) - *minVal = h_valBuf(0, 0); - - if (maxVal) - *maxVal = h_valBuf(1, 0); - - if (minLoc) - { - const int idx = h_locBuf(0, 0); - *minLoc = cv::Point(idx % src.cols, idx / src.cols); - } - - if (maxLoc) - { - const int idx = h_locBuf(1, 0); - *maxLoc = cv::Point(idx % src.cols, idx / src.cols); - } + gridMinMaxLoc(src, valBuf, locBuf, globPtr(mask), stream); } } -void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray _mask, GpuMat& valBuf, GpuMat& locBuf) +void cv::cuda::findMinMaxLoc(InputArray _src, OutputArray _minMaxVals, OutputArray _loc, InputArray _mask, Stream& stream) { - typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc); + typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, Stream& stream); static const func_t funcs[] = { - minMaxLocImpl, - minMaxLocImpl, - minMaxLocImpl, - minMaxLocImpl, - minMaxLocImpl, - minMaxLocImpl, - minMaxLocImpl + minMaxLocImpl, + minMaxLocImpl, + minMaxLocImpl, + minMaxLocImpl, + minMaxLocImpl, + minMaxLocImpl, + minMaxLocImpl }; - GpuMat src = _src.getGpuMat(); - GpuMat mask = _mask.getGpuMat(); + const GpuMat src = getInputMat(_src, stream); + const GpuMat mask = getInputMat(_mask, stream); CV_Assert( src.channels() == 1 ); - CV_DbgAssert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) ); + CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) ); - const func_t func = funcs[src.depth()]; + const int src_depth = src.depth(); - func(src, mask, valBuf, locBuf, minVal, maxVal, minLoc, maxLoc); + BufferPool pool(stream); + GpuMat valBuf(pool.getAllocator()); + GpuMat locBuf(pool.getAllocator()); + + const func_t func = funcs[src_depth]; + func(src, mask, valBuf, locBuf, stream); + + GpuMat minMaxVals = valBuf.colRange(0, 1); + GpuMat loc = locBuf.colRange(0, 1); + + if (_minMaxVals.kind() == 
_InputArray::CUDA_GPU_MAT) + { + minMaxVals.copyTo(_minMaxVals, stream); + } + else + { + minMaxVals.download(_minMaxVals, stream); + } + + if (_loc.kind() == _InputArray::CUDA_GPU_MAT) + { + loc.copyTo(_loc, stream); + } + else + { + loc.download(_loc, stream); + } +} + +void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray _mask) +{ + Stream& stream = Stream::Null(); + + HostMem minMaxVals, locVals; + findMinMaxLoc(_src, minMaxVals, locVals, _mask, stream); + + stream.waitForCompletion(); + + double vals[2]; + minMaxVals.createMatHeader().convertTo(Mat(minMaxVals.size(), CV_64FC1, &vals[0]), CV_64F); + + int locs[2]; + locVals.createMatHeader().copyTo(Mat(locVals.size(), CV_32SC1, &locs[0])); + Size size = _src.size(); + cv::Point locs2D[] = { + cv::Point(locs[0] % size.width, locs[0] / size.width), + cv::Point(locs[1] % size.width, locs[1] / size.width), + }; + + if (minVal) + *minVal = vals[0]; + + if (maxVal) + *maxVal = vals[1]; + + if (minLoc) + *minLoc = locs2D[0]; + + if (maxLoc) + *maxLoc = locs2D[1]; } #endif diff --git a/modules/cudaarithm/src/cuda/norm.cu b/modules/cudaarithm/src/cuda/norm.cu index bda6b4581..baf76a6db 100644 --- a/modules/cudaarithm/src/cuda/norm.cu +++ b/modules/cudaarithm/src/cuda/norm.cu @@ -50,70 +50,140 @@ #include "opencv2/cudaarithm.hpp" #include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; namespace { - double normDiffInf(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf) + void normDiffInf(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _dst, Stream& stream) { const GpuMat_& src1 = (const GpuMat_&) _src1; const GpuMat_& src2 = (const GpuMat_&) _src2; - GpuMat_& buf = (GpuMat_&) _buf; + GpuMat_& dst = (GpuMat_&) _dst; - gridFindMinMaxVal(abs_(cvt_(src1) - cvt_(src2)), buf); - - int data[2]; - buf.download(cv::Mat(1, 2, buf.type(), data)); - - return data[1]; + gridFindMaxVal(abs_(cvt_(src1) - cvt_(src2)), dst, stream); } - double normDiffL1(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf) + void normDiffL1(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _dst, Stream& stream) { const GpuMat_& src1 = (const GpuMat_&) _src1; const GpuMat_& src2 = (const GpuMat_&) _src2; - GpuMat_& buf = (GpuMat_&) _buf; + GpuMat_& dst = (GpuMat_&) _dst; - gridCalcSum(abs_(cvt_(src1) - cvt_(src2)), buf); - - int data; - buf.download(cv::Mat(1, 1, buf.type(), &data)); - - return data; + gridCalcSum(abs_(cvt_(src1) - cvt_(src2)), dst, stream); } - double normDiffL2(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf) + void normDiffL2(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _dst, Stream& stream) { const GpuMat_& src1 = (const GpuMat_&) _src1; const GpuMat_& src2 = (const GpuMat_&) _src2; - GpuMat_& buf = (GpuMat_&) _buf; + GpuMat_& dst = (GpuMat_&) _dst; - gridCalcSum(sqr_(cvt_(src1) - cvt_(src2)), buf); + BufferPool pool(stream); + GpuMat_ buf(1, 1, pool.getAllocator()); - double data; - buf.download(cv::Mat(1, 1, buf.type(), &data)); - - return std::sqrt(data); + gridCalcSum(sqr_(cvt_(src1) - cvt_(src2)), buf, stream); + gridTransformUnary(buf, dst, sqrt_func(), stream); } } -double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normType) +void cv::cuda::calcNormDiff(InputArray _src1, InputArray _src2, OutputArray _dst, int normType, Stream& stream) { - typedef double (*func_t)(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf); + typedef void (*func_t)(const GpuMat& 
_src1, const GpuMat& _src2, GpuMat& _dst, Stream& stream); static const func_t funcs[] = { 0, normDiffInf, normDiffL1, 0, normDiffL2 }; - GpuMat src1 = _src1.getGpuMat(); - GpuMat src2 = _src2.getGpuMat(); + GpuMat src1 = getInputMat(_src1, stream); + GpuMat src2 = getInputMat(_src2, stream); CV_Assert( src1.type() == CV_8UC1 ); CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() ); CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 ); - return funcs[normType](src1, src2, buf); + GpuMat dst = getOutputMat(_dst, 1, 1, normType == NORM_L2 ? CV_64FC1 : CV_32SC1, stream); + + const func_t func = funcs[normType]; + func(src1, src2, dst, stream); + + syncOutput(dst, _dst, stream); +} + +double cv::cuda::norm(InputArray _src1, InputArray _src2, int normType) +{ + Stream& stream = Stream::Null(); + + HostMem dst; + calcNormDiff(_src1, _src2, dst, normType, stream); + + stream.waitForCompletion(); + + double val; + dst.createMatHeader().convertTo(Mat(1, 1, CV_64FC1, &val), CV_64F); + + return val; +} + +namespace cv { namespace cuda { namespace internal { + +void normL2(cv::InputArray _src, cv::OutputArray _dst, cv::InputArray _mask, Stream& stream); + +}}} + +namespace +{ + template + void normL2Impl(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream) + { + const GpuMat_& src = (const GpuMat_&) _src; + GpuMat_& dst = (GpuMat_&) _dst; + + BufferPool pool(stream); + GpuMat_ buf(1, 1, pool.getAllocator()); + + if (mask.empty()) + { + gridCalcSum(sqr_(cvt_(src)), buf, stream); + } + else + { + gridCalcSum(sqr_(cvt_(src)), buf, globPtr(mask), stream); + } + + gridTransformUnary(buf, dst, sqrt_func(), stream); + } +} + +void cv::cuda::internal::normL2(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream); + static const func_t funcs[] = + { + normL2Impl, + normL2Impl, + normL2Impl, + normL2Impl, + normL2Impl, + normL2Impl, + normL2Impl + }; + + const GpuMat src = getInputMat(_src, stream); + const GpuMat mask = getInputMat(_mask, stream); + + CV_Assert( src.channels() == 1 ); + CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) ); + + GpuMat dst = getOutputMat(_dst, 1, 1, CV_64FC1, stream); + + const func_t func = funcs[src.depth()]; + func(src, mask, dst, stream); + + syncOutput(dst, _dst, stream); } #endif diff --git a/modules/cudaarithm/src/cuda/normalize.cu b/modules/cudaarithm/src/cuda/normalize.cu new file mode 100644 index 000000000..efbc94ecc --- /dev/null +++ b/modules/cudaarithm/src/cuda/normalize.cu @@ -0,0 +1,290 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "opencv2/opencv_modules.hpp" + +#ifndef HAVE_OPENCV_CUDEV + +#error "opencv_cudev is required" + +#else + +#include "opencv2/cudaarithm.hpp" +#include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" + +using namespace cv; +using namespace cv::cuda; +using namespace cv::cudev; + +namespace { + +template +struct ConvertorMinMax : unary_function +{ + typedef typename LargerType::type larger_type1; + typedef typename LargerType::type larger_type2; + typedef typename LargerType::type scalar_type; + + scalar_type dmin, dmax; + const I* minMaxVals; + + __device__ R operator ()(typename TypeTraits::parameter_type src) const + { + const scalar_type smin = minMaxVals[0]; + const scalar_type smax = minMaxVals[1]; + + const scalar_type scale = (dmax - dmin) * (smax - smin > numeric_limits::epsilon() ? 
1.0 / (smax - smin) : 0.0); + const scalar_type shift = dmin - smin * scale; + + return cudev::saturate_cast(scale * src + shift); + } +}; + +template +void normalizeMinMax(const GpuMat& _src, GpuMat& _dst, double a, double b, const GpuMat& mask, Stream& stream) +{ + const GpuMat_& src = (const GpuMat_&)_src; + GpuMat_& dst = (GpuMat_&)_dst; + + BufferPool pool(stream); + GpuMat_ minMaxVals(1, 2, pool.getAllocator()); + + if (mask.empty()) + { + gridFindMinMaxVal(src, minMaxVals, stream); + } + else + { + gridFindMinMaxVal(src, minMaxVals, globPtr(mask), stream); + } + + ConvertorMinMax cvt; + cvt.dmin = std::min(a, b); + cvt.dmax = std::max(a, b); + cvt.minMaxVals = minMaxVals[0]; + + if (mask.empty()) + { + gridTransformUnary(src, dst, cvt, stream); + } + else + { + dst.setTo(Scalar::all(0), stream); + gridTransformUnary(src, dst, cvt, globPtr(mask), stream); + } +} + +template +struct ConvertorNorm : unary_function +{ + typedef typename LargerType::type larger_type1; + typedef typename LargerType::type larger_type2; + typedef typename LargerType::type scalar_type; + + scalar_type a; + const I* normVal; + + __device__ R operator ()(typename TypeTraits::parameter_type src) const + { + sqrt_func sqrt; + + scalar_type scale = normL2 ? sqrt(*normVal) : *normVal; + scale = scale > numeric_limits::epsilon() ? a / scale : 0.0; + + return cudev::saturate_cast(scale * src); + } +}; + +template +void normalizeNorm(const GpuMat& _src, GpuMat& _dst, double a, int normType, const GpuMat& mask, Stream& stream) +{ + const GpuMat_& src = (const GpuMat_&)_src; + GpuMat_& dst = (GpuMat_&)_dst; + + BufferPool pool(stream); + GpuMat_ normVal(1, 1, pool.getAllocator()); + + if (normType == NORM_L1) + { + if (mask.empty()) + { + gridCalcSum(abs_(cvt_(src)), normVal, stream); + } + else + { + gridCalcSum(abs_(cvt_(src)), normVal, globPtr(mask), stream); + } + } + else if (normType == NORM_L2) + { + if (mask.empty()) + { + gridCalcSum(sqr_(cvt_(src)), normVal, stream); + } + else + { + gridCalcSum(sqr_(cvt_(src)), normVal, globPtr(mask), stream); + } + } + else // NORM_INF + { + if (mask.empty()) + { + gridFindMaxVal(abs_(cvt_(src)), normVal, stream); + } + else + { + gridFindMaxVal(abs_(cvt_(src)), normVal, globPtr(mask), stream); + } + } + + if (normType == NORM_L2) + { + ConvertorNorm cvt; + cvt.a = a; + cvt.normVal = normVal[0]; + + if (mask.empty()) + { + gridTransformUnary(src, dst, cvt, stream); + } + else + { + dst.setTo(Scalar::all(0), stream); + gridTransformUnary(src, dst, cvt, globPtr(mask), stream); + } + } + else + { + ConvertorNorm cvt; + cvt.a = a; + cvt.normVal = normVal[0]; + + if (mask.empty()) + { + gridTransformUnary(src, dst, cvt, stream); + } + else + { + dst.setTo(Scalar::all(0), stream); + gridTransformUnary(src, dst, cvt, globPtr(mask), stream); + } + } +} + +} // namespace + +void cv::cuda::normalize(InputArray _src, OutputArray _dst, double a, double b, int normType, int dtype, InputArray _mask, Stream& stream) +{ + typedef void (*func_minmax_t)(const GpuMat& _src, GpuMat& _dst, double a, double b, const GpuMat& mask, Stream& stream); + typedef void (*func_norm_t)(const GpuMat& _src, GpuMat& _dst, double a, int normType, const GpuMat& mask, Stream& stream); + + static const func_minmax_t funcs_minmax[] = + { + normalizeMinMax, + normalizeMinMax, + normalizeMinMax, + normalizeMinMax, + normalizeMinMax, + normalizeMinMax, + normalizeMinMax + }; + + static const func_norm_t funcs_norm[] = + { + normalizeNorm, + normalizeNorm, + normalizeNorm, + normalizeNorm, + normalizeNorm, + 
normalizeNorm, + normalizeNorm + }; + + CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_MINMAX ); + + const GpuMat src = getInputMat(_src, stream); + const GpuMat mask = getInputMat(_mask, stream); + + CV_Assert( src.channels() == 1 ); + CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) ); + + dtype = CV_MAT_DEPTH(dtype); + + const int src_depth = src.depth(); + const int tmp_depth = src_depth <= CV_32F ? CV_32F : src_depth; + + GpuMat dst; + if (dtype == tmp_depth) + { + _dst.create(src.size(), tmp_depth); + dst = getOutputMat(_dst, src.size(), tmp_depth, stream); + } + else + { + BufferPool pool(stream); + dst = pool.getBuffer(src.size(), tmp_depth); + } + + if (normType == NORM_MINMAX) + { + const func_minmax_t func = funcs_minmax[src_depth]; + func(src, dst, a, b, mask, stream); + } + else + { + const func_norm_t func = funcs_norm[src_depth]; + func(src, dst, a, normType, mask, stream); + } + + if (dtype == tmp_depth) + { + syncOutput(dst, _dst, stream); + } + else + { + dst.convertTo(_dst, dtype, stream); + } +} + +#endif diff --git a/modules/cudaarithm/src/cuda/sum.cu b/modules/cudaarithm/src/cuda/sum.cu index cced9c56e..016044903 100644 --- a/modules/cudaarithm/src/cuda/sum.cu +++ b/modules/cudaarithm/src/cuda/sum.cu @@ -50,126 +50,153 @@ #include "opencv2/cudaarithm.hpp" #include "opencv2/cudev.hpp" +#include "opencv2/core/private.cuda.hpp" +using namespace cv; +using namespace cv::cuda; using namespace cv::cudev; namespace { template - cv::Scalar sumImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf) + void sumImpl(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream) { typedef typename MakeVec::type src_type; typedef typename MakeVec::type res_type; const GpuMat_& src = (const GpuMat_&) _src; - GpuMat_& buf = (GpuMat_&) _buf; + GpuMat_& dst = (GpuMat_&) _dst; if (mask.empty()) - gridCalcSum(src, buf); + gridCalcSum(src, dst, stream); else - gridCalcSum(src, buf, globPtr(mask)); - - cv::Scalar_ res; - cv::Mat res_mat(buf.size(), buf.type(), res.val); - buf.download(res_mat); - - return res; + gridCalcSum(src, dst, globPtr(mask), stream); } template - cv::Scalar sumAbsImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf) + void sumAbsImpl(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream) { typedef typename MakeVec::type src_type; typedef typename MakeVec::type res_type; const GpuMat_& src = (const GpuMat_&) _src; - GpuMat_& buf = (GpuMat_&) _buf; + GpuMat_& dst = (GpuMat_&) _dst; if (mask.empty()) - gridCalcSum(abs_(cvt_(src)), buf); + gridCalcSum(abs_(cvt_(src)), dst, stream); else - gridCalcSum(abs_(cvt_(src)), buf, globPtr(mask)); - - cv::Scalar_ res; - cv::Mat res_mat(buf.size(), buf.type(), res.val); - buf.download(res_mat); - - return res; + gridCalcSum(abs_(cvt_(src)), dst, globPtr(mask), stream); } template - cv::Scalar sumSqrImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf) + void sumSqrImpl(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream) { typedef typename MakeVec::type src_type; typedef typename MakeVec::type res_type; const GpuMat_& src = (const GpuMat_&) _src; - GpuMat_& buf = (GpuMat_&) _buf; + GpuMat_& dst = (GpuMat_&) _dst; if (mask.empty()) - gridCalcSum(sqr_(cvt_(src)), buf); + gridCalcSum(sqr_(cvt_(src)), dst, stream); else - gridCalcSum(sqr_(cvt_(src)), buf, globPtr(mask)); - - cv::Scalar_ res; - cv::Mat res_mat(buf.size(), buf.type(), res.val); - buf.download(res_mat); - - return res; + 
gridCalcSum(sqr_(cvt_(src)), dst, globPtr(mask), stream); } } -cv::Scalar cv::cuda::sum(InputArray _src, InputArray _mask, GpuMat& buf) +void cv::cuda::calcSum(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream) { - typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf); + typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream); static const func_t funcs[7][4] = { - {sumImpl, sumImpl, sumImpl, sumImpl}, - {sumImpl, sumImpl, sumImpl, sumImpl}, - {sumImpl, sumImpl, sumImpl, sumImpl}, - {sumImpl, sumImpl, sumImpl, sumImpl}, - {sumImpl, sumImpl, sumImpl, sumImpl}, - {sumImpl, sumImpl, sumImpl, sumImpl}, + {sumImpl, sumImpl, sumImpl, sumImpl}, + {sumImpl, sumImpl, sumImpl, sumImpl}, + {sumImpl, sumImpl, sumImpl, sumImpl}, + {sumImpl, sumImpl, sumImpl, sumImpl}, + {sumImpl, sumImpl, sumImpl, sumImpl}, + {sumImpl, sumImpl, sumImpl, sumImpl}, {sumImpl, sumImpl, sumImpl, sumImpl} }; - GpuMat src = _src.getGpuMat(); - GpuMat mask = _mask.getGpuMat(); + const GpuMat src = getInputMat(_src, stream); + const GpuMat mask = getInputMat(_mask, stream); - CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) ); + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) ); - const func_t func = funcs[src.depth()][src.channels() - 1]; + const int src_depth = src.depth(); + const int channels = src.channels(); - return func(src, mask, buf); + GpuMat dst = getOutputMat(_dst, 1, 1, CV_64FC(channels), stream); + + const func_t func = funcs[src_depth][channels - 1]; + func(src, dst, mask, stream); + + syncOutput(dst, _dst, stream); } -cv::Scalar cv::cuda::absSum(InputArray _src, InputArray _mask, GpuMat& buf) +cv::Scalar cv::cuda::sum(InputArray _src, InputArray _mask) { - typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf); + Stream& stream = Stream::Null(); + + HostMem dst; + calcSum(_src, dst, _mask, stream); + + stream.waitForCompletion(); + + cv::Scalar val; + dst.createMatHeader().convertTo(cv::Mat(dst.size(), CV_64FC(dst.channels()), val.val), CV_64F); + + return val; +} + +void cv::cuda::calcAbsSum(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream); static const func_t funcs[7][4] = { - {sumAbsImpl, sumAbsImpl, sumAbsImpl, sumAbsImpl}, - {sumAbsImpl, sumAbsImpl, sumAbsImpl, sumAbsImpl}, - {sumAbsImpl, sumAbsImpl, sumAbsImpl, sumAbsImpl}, - {sumAbsImpl, sumAbsImpl, sumAbsImpl, sumAbsImpl}, - {sumAbsImpl, sumAbsImpl, sumAbsImpl, sumAbsImpl}, - {sumAbsImpl, sumAbsImpl, sumAbsImpl, sumAbsImpl}, + {sumAbsImpl, sumAbsImpl, sumAbsImpl, sumAbsImpl}, + {sumAbsImpl, sumAbsImpl, sumAbsImpl, sumAbsImpl}, + {sumAbsImpl, sumAbsImpl, sumAbsImpl, sumAbsImpl}, + {sumAbsImpl, sumAbsImpl, sumAbsImpl, sumAbsImpl}, + {sumAbsImpl, sumAbsImpl, sumAbsImpl, sumAbsImpl}, + {sumAbsImpl, sumAbsImpl, sumAbsImpl, sumAbsImpl}, {sumAbsImpl, sumAbsImpl, sumAbsImpl, sumAbsImpl} }; - GpuMat src = _src.getGpuMat(); - GpuMat mask = _mask.getGpuMat(); + const GpuMat src = getInputMat(_src, stream); + const GpuMat mask = getInputMat(_mask, stream); - CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) ); + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) ); - const func_t func = funcs[src.depth()][src.channels() - 1]; + const int src_depth = src.depth(); + const int channels = src.channels(); - return 
func(src, mask, buf); + GpuMat dst = getOutputMat(_dst, 1, 1, CV_64FC(channels), stream); + + const func_t func = funcs[src_depth][channels - 1]; + func(src, dst, mask, stream); + + syncOutput(dst, _dst, stream); } -cv::Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf) +cv::Scalar cv::cuda::absSum(InputArray _src, InputArray _mask) { - typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf); + Stream& stream = Stream::Null(); + + HostMem dst; + calcAbsSum(_src, dst, _mask, stream); + + stream.waitForCompletion(); + + cv::Scalar val; + dst.createMatHeader().convertTo(cv::Mat(dst.size(), CV_64FC(dst.channels()), val.val), CV_64F); + + return val; +} + +void cv::cuda::calcSqrSum(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream); static const func_t funcs[7][4] = { {sumSqrImpl, sumSqrImpl, sumSqrImpl, sumSqrImpl}, @@ -181,14 +208,35 @@ cv::Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf) {sumSqrImpl, sumSqrImpl, sumSqrImpl, sumSqrImpl} }; - GpuMat src = _src.getGpuMat(); - GpuMat mask = _mask.getGpuMat(); + const GpuMat src = getInputMat(_src, stream); + const GpuMat mask = getInputMat(_mask, stream); - CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) ); + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) ); - const func_t func = funcs[src.depth()][src.channels() - 1]; + const int src_depth = src.depth(); + const int channels = src.channels(); - return func(src, mask, buf); + GpuMat dst = getOutputMat(_dst, 1, 1, CV_64FC(channels), stream); + + const func_t func = funcs[src_depth][channels - 1]; + func(src, dst, mask, stream); + + syncOutput(dst, _dst, stream); +} + +cv::Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask) +{ + Stream& stream = Stream::Null(); + + HostMem dst; + calcSqrSum(_src, dst, _mask, stream); + + stream.waitForCompletion(); + + cv::Scalar val; + dst.createMatHeader().convertTo(cv::Mat(dst.size(), CV_64FC(dst.channels()), val.val), CV_64F); + + return val; } #endif diff --git a/modules/cudaarithm/src/reductions.cpp b/modules/cudaarithm/src/reductions.cpp index c1e2af4ed..8d0add453 100644 --- a/modules/cudaarithm/src/reductions.cpp +++ b/modules/cudaarithm/src/reductions.cpp @@ -47,110 +47,106 @@ using namespace cv::cuda; #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) -double cv::cuda::norm(InputArray, int, InputArray, GpuMat&) { throw_no_cuda(); return 0.0; } -double cv::cuda::norm(InputArray, InputArray, GpuMat&, int) { throw_no_cuda(); return 0.0; } +double cv::cuda::norm(InputArray, int, InputArray) { throw_no_cuda(); return 0.0; } +void cv::cuda::calcNorm(InputArray, OutputArray, int, InputArray, Stream&) { throw_no_cuda(); } +double cv::cuda::norm(InputArray, InputArray, int) { throw_no_cuda(); return 0.0; } +void cv::cuda::calcNormDiff(InputArray, InputArray, OutputArray, int, Stream&) { throw_no_cuda(); } -Scalar cv::cuda::sum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); } -Scalar cv::cuda::absSum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); } -Scalar cv::cuda::sqrSum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); } +Scalar cv::cuda::sum(InputArray, InputArray) { throw_no_cuda(); return Scalar(); } +void cv::cuda::calcSum(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); } +Scalar cv::cuda::absSum(InputArray, 
InputArray) { throw_no_cuda(); return Scalar(); } +void cv::cuda::calcAbsSum(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); } +Scalar cv::cuda::sqrSum(InputArray, InputArray) { throw_no_cuda(); return Scalar(); } +void cv::cuda::calcSqrSum(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); } -void cv::cuda::minMax(InputArray, double*, double*, InputArray, GpuMat&) { throw_no_cuda(); } -void cv::cuda::minMaxLoc(InputArray, double*, double*, Point*, Point*, InputArray, GpuMat&, GpuMat&) { throw_no_cuda(); } +void cv::cuda::minMax(InputArray, double*, double*, InputArray) { throw_no_cuda(); } +void cv::cuda::findMinMax(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); } +void cv::cuda::minMaxLoc(InputArray, double*, double*, Point*, Point*, InputArray) { throw_no_cuda(); } +void cv::cuda::findMinMaxLoc(InputArray, OutputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); } -int cv::cuda::countNonZero(InputArray, GpuMat&) { throw_no_cuda(); return 0; } +int cv::cuda::countNonZero(InputArray) { throw_no_cuda(); return 0; } +void cv::cuda::countNonZero(InputArray, OutputArray, Stream&) { throw_no_cuda(); } void cv::cuda::reduce(InputArray, OutputArray, int, int, int, Stream&) { throw_no_cuda(); } -void cv::cuda::meanStdDev(InputArray, Scalar&, Scalar&, GpuMat&) { throw_no_cuda(); } +void cv::cuda::meanStdDev(InputArray, Scalar&, Scalar&) { throw_no_cuda(); } +void cv::cuda::meanStdDev(InputArray, OutputArray, Stream&) { throw_no_cuda(); } void cv::cuda::rectStdDev(InputArray, InputArray, OutputArray, Rect, Stream&) { throw_no_cuda(); } -void cv::cuda::normalize(InputArray, OutputArray, double, double, int, int, InputArray, GpuMat&, GpuMat&) { throw_no_cuda(); } +void cv::cuda::normalize(InputArray, OutputArray, double, double, int, int, InputArray, Stream&) { throw_no_cuda(); } -void cv::cuda::integral(InputArray, OutputArray, GpuMat&, Stream&) { throw_no_cuda(); } -void cv::cuda::sqrIntegral(InputArray, OutputArray, GpuMat&, Stream&) { throw_no_cuda(); } +void cv::cuda::integral(InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::cuda::sqrIntegral(InputArray, OutputArray, Stream&) { throw_no_cuda(); } #else -namespace -{ - class DeviceBuffer - { - public: - explicit DeviceBuffer(int count_ = 1) : count(count_) - { - cudaSafeCall( cudaMalloc(&pdev, count * sizeof(double)) ); - } - ~DeviceBuffer() - { - cudaSafeCall( cudaFree(pdev) ); - } - - operator double*() {return pdev;} - - void download(double* hptr) - { - double hbuf; - cudaSafeCall( cudaMemcpy(&hbuf, pdev, sizeof(double), cudaMemcpyDeviceToHost) ); - *hptr = hbuf; - } - void download(double** hptrs) - { - AutoBuffer hbuf(count); - cudaSafeCall( cudaMemcpy((void*)hbuf, pdev, count * sizeof(double), cudaMemcpyDeviceToHost) ); - for (int i = 0; i < count; ++i) - *hptrs[i] = hbuf[i]; - } - - private: - double* pdev; - int count; - }; -} - //////////////////////////////////////////////////////////////////////// // norm -double cv::cuda::norm(InputArray _src, int normType, InputArray _mask, GpuMat& buf) -{ - GpuMat src = _src.getGpuMat(); - GpuMat mask = _mask.getGpuMat(); +namespace cv { namespace cuda { namespace internal { +void normL2(cv::InputArray _src, cv::OutputArray _dst, cv::InputArray _mask, Stream& stream); + +void findMaxAbs(cv::InputArray _src, cv::OutputArray _dst, cv::InputArray _mask, Stream& stream); + +}}} + +void cv::cuda::calcNorm(InputArray _src, OutputArray dst, int normType, InputArray mask, Stream& stream) +{ CV_Assert( normType == NORM_INF || normType 
== NORM_L1 || normType == NORM_L2 ); - CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size() && src.channels() == 1) ); + + GpuMat src = getInputMat(_src, stream); GpuMat src_single_channel = src.reshape(1); if (normType == NORM_L1) - return cuda::absSum(src_single_channel, mask, buf)[0]; + { + calcAbsSum(src_single_channel, dst, mask, stream); + } + else if (normType == NORM_L2) + { + internal::normL2(src_single_channel, dst, mask, stream); + } + else // NORM_INF + { + internal::findMaxAbs(src_single_channel, dst, mask, stream); + } +} - if (normType == NORM_L2) - return std::sqrt(cuda::sqrSum(src_single_channel, mask, buf)[0]); +double cv::cuda::norm(InputArray _src, int normType, InputArray _mask) +{ + Stream& stream = Stream::Null(); - // NORM_INF - double min_val, max_val; - cuda::minMax(src_single_channel, &min_val, &max_val, mask, buf); - return std::max(std::abs(min_val), std::abs(max_val)); + HostMem dst; + calcNorm(_src, dst, normType, _mask, stream); + + stream.waitForCompletion(); + + double val; + dst.createMatHeader().convertTo(Mat(1, 1, CV_64FC1, &val), CV_64F); + + return val; } //////////////////////////////////////////////////////////////////////// // meanStdDev -void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev, GpuMat& buf) +void cv::cuda::meanStdDev(InputArray _src, OutputArray _dst, Stream& stream) { - GpuMat src = _src.getGpuMat(); + if (!deviceSupports(FEATURE_SET_COMPUTE_13)) + CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility"); + + const GpuMat src = getInputMat(_src, stream); CV_Assert( src.type() == CV_8UC1 ); - if (!deviceSupports(FEATURE_SET_COMPUTE_13)) - CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility"); + GpuMat dst = getOutputMat(_dst, 1, 2, CV_64FC1, stream); NppiSize sz; sz.width = src.cols; sz.height = src.rows; - DeviceBuffer dbuf(2); - int bufSize; #if (CUDA_VERSION <= 4020) nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) ); @@ -158,14 +154,30 @@ void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev, GpuMat& nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) ); #endif - ensureSizeIsEnough(1, bufSize, CV_8UC1, buf); + BufferPool pool(stream); + GpuMat buf = pool.getBuffer(1, bufSize, CV_8UC1); - nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr(), static_cast(src.step), sz, buf.ptr(), dbuf, (double*)dbuf + 1) ); + NppStreamHandler h(StreamAccessor::getStream(stream)); - cudaSafeCall( cudaDeviceSynchronize() ); + nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr(), static_cast(src.step), sz, buf.ptr(), dst.ptr(), dst.ptr() + 1) ); - double* ptrs[2] = {mean.val, stddev.val}; - dbuf.download(ptrs); + syncOutput(dst, _dst, stream); +} + +void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev) +{ + Stream& stream = Stream::Null(); + + HostMem dst; + meanStdDev(_src, dst, stream); + + stream.waitForCompletion(); + + double vals[2]; + dst.createMatHeader().copyTo(Mat(1, 2, CV_64FC1, &vals[0])); + + mean = Scalar(vals[0]); + stddev = Scalar(vals[1]); } ////////////////////////////////////////////////////////////////////////////// @@ -173,13 +185,12 @@ void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev, GpuMat& void cv::cuda::rectStdDev(InputArray _src, InputArray _sqr, OutputArray _dst, Rect rect, Stream& _stream) { - GpuMat src = _src.getGpuMat(); - GpuMat sqr = _sqr.getGpuMat(); + GpuMat src = getInputMat(_src, _stream); + GpuMat sqr = getInputMat(_sqr, _stream); CV_Assert( src.type() == 
CV_32SC1 && sqr.type() == CV_64FC1 ); - _dst.create(src.size(), CV_32FC1); - GpuMat dst = _dst.getGpuMat(); + GpuMat dst = getOutputMat(_dst, src.size(), CV_32FC1, _stream); NppiSize sz; sz.width = src.cols; @@ -200,45 +211,8 @@ void cv::cuda::rectStdDev(InputArray _src, InputArray _sqr, OutputArray _dst, Re if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); -} -//////////////////////////////////////////////////////////////////////// -// normalize - -void cv::cuda::normalize(InputArray _src, OutputArray dst, double a, double b, int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf) -{ - GpuMat src = _src.getGpuMat(); - - double scale = 1, shift = 0; - - if (norm_type == NORM_MINMAX) - { - double smin = 0, smax = 0; - double dmin = std::min(a, b), dmax = std::max(a, b); - cuda::minMax(src, &smin, &smax, mask, norm_buf); - scale = (dmax - dmin) * (smax - smin > std::numeric_limits::epsilon() ? 1.0 / (smax - smin) : 0.0); - shift = dmin - smin * scale; - } - else if (norm_type == NORM_L2 || norm_type == NORM_L1 || norm_type == NORM_INF) - { - scale = cuda::norm(src, norm_type, mask, norm_buf); - scale = scale > std::numeric_limits::epsilon() ? a / scale : 0.0; - shift = 0; - } - else - { - CV_Error(cv::Error::StsBadArg, "Unknown/unsupported norm type"); - } - - if (mask.empty()) - { - src.convertTo(dst, dtype, scale, shift); - } - else - { - src.convertTo(cvt_buf, dtype, scale, shift); - cvt_buf.copyTo(dst, mask); - } + syncOutput(dst, _dst, _stream); } #endif diff --git a/modules/cudaarithm/test/test_reductions.cpp b/modules/cudaarithm/test/test_reductions.cpp index e3c54055d..a95d007b8 100644 --- a/modules/cudaarithm/test/test_reductions.cpp +++ b/modules/cudaarithm/test/test_reductions.cpp @@ -74,8 +74,27 @@ CUDA_TEST_P(Norm, Accuracy) cv::Mat src = randomMat(size, depth); cv::Mat mask = randomMat(size, CV_8UC1, 0, 2); - cv::cuda::GpuMat d_buf; - double val = cv::cuda::norm(loadMat(src, useRoi), normCode, loadMat(mask, useRoi), d_buf); + double val = cv::cuda::norm(loadMat(src, useRoi), normCode, loadMat(mask, useRoi)); + + double val_gold = cv::norm(src, normCode, mask); + + EXPECT_NEAR(val_gold, val, depth < CV_32F ? 0.0 : 1.0); +} + +CUDA_TEST_P(Norm, Async) +{ + cv::Mat src = randomMat(size, depth); + cv::Mat mask = randomMat(size, CV_8UC1, 0, 2); + + cv::cuda::Stream stream; + + cv::cuda::HostMem dst; + cv::cuda::calcNorm(loadMat(src, useRoi), dst, normCode, loadMat(mask, useRoi), stream); + + stream.waitForCompletion(); + + double val; + dst.createMatHeader().convertTo(cv::Mat(1, 1, CV_64FC1, &val), CV_64F); double val_gold = cv::norm(src, normCode, mask); @@ -127,6 +146,27 @@ CUDA_TEST_P(NormDiff, Accuracy) EXPECT_NEAR(val_gold, val, 0.0); } +CUDA_TEST_P(NormDiff, Async) +{ + cv::Mat src1 = randomMat(size, CV_8UC1); + cv::Mat src2 = randomMat(size, CV_8UC1); + + cv::cuda::Stream stream; + + cv::cuda::HostMem dst; + cv::cuda::calcNormDiff(loadMat(src1, useRoi), loadMat(src2, useRoi), dst, normCode, stream); + + stream.waitForCompletion(); + + double val; + const cv::Mat val_mat(1, 1, CV_64FC1, &val); + dst.createMatHeader().convertTo(val_mat, CV_64F); + + double val_gold = cv::norm(src1, src2, normCode); + + EXPECT_NEAR(val_gold, val, 0.0); +} + INSTANTIATE_TEST_CASE_P(CUDA_Arithm, NormDiff, testing::Combine( ALL_DEVICES, DIFFERENT_SIZES, @@ -247,6 +287,24 @@ CUDA_TEST_P(Sum, Simple) EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 
0.0 : 0.5); } +CUDA_TEST_P(Sum, Simple_Async) +{ + cv::cuda::Stream stream; + + cv::cuda::HostMem dst; + cv::cuda::calcSum(loadMat(src, useRoi), dst, cv::noArray(), stream); + + stream.waitForCompletion(); + + cv::Scalar val; + cv::Mat val_mat(dst.size(), CV_64FC(dst.channels()), val.val); + dst.createMatHeader().convertTo(val_mat, CV_64F); + + cv::Scalar val_gold = cv::sum(src); + + EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5); +} + CUDA_TEST_P(Sum, Abs) { cv::Scalar val = cv::cuda::absSum(loadMat(src, useRoi)); @@ -256,6 +314,24 @@ CUDA_TEST_P(Sum, Abs) EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5); } +CUDA_TEST_P(Sum, Abs_Async) +{ + cv::cuda::Stream stream; + + cv::cuda::HostMem dst; + cv::cuda::calcAbsSum(loadMat(src, useRoi), dst, cv::noArray(), stream); + + stream.waitForCompletion(); + + cv::Scalar val; + cv::Mat val_mat(dst.size(), CV_64FC(dst.channels()), val.val); + dst.createMatHeader().convertTo(val_mat, CV_64F); + + cv::Scalar val_gold = absSumGold(src); + + EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5); +} + CUDA_TEST_P(Sum, Sqr) { cv::Scalar val = cv::cuda::sqrSum(loadMat(src, useRoi)); @@ -265,6 +341,24 @@ CUDA_TEST_P(Sum, Sqr) EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5); } +CUDA_TEST_P(Sum, Sqr_Async) +{ + cv::cuda::Stream stream; + + cv::cuda::HostMem dst; + cv::cuda::calcSqrSum(loadMat(src, useRoi), dst, cv::noArray(), stream); + + stream.waitForCompletion(); + + cv::Scalar val; + cv::Mat val_mat(dst.size(), CV_64FC(dst.channels()), val.val); + dst.createMatHeader().convertTo(val_mat, CV_64F); + + cv::Scalar val_gold = sqrSumGold(src); + + EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5); +} + INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Sum, testing::Combine( ALL_DEVICES, DIFFERENT_SIZES, @@ -321,6 +415,28 @@ CUDA_TEST_P(MinMax, WithoutMask) } } +CUDA_TEST_P(MinMax, Async) +{ + cv::Mat src = randomMat(size, depth); + + cv::cuda::Stream stream; + + cv::cuda::HostMem dst; + cv::cuda::findMinMax(loadMat(src, useRoi), dst, cv::noArray(), stream); + + stream.waitForCompletion(); + + double vals[2]; + const cv::Mat vals_mat(1, 2, CV_64FC1, &vals[0]); + dst.createMatHeader().convertTo(vals_mat, CV_64F); + + double minVal_gold, maxVal_gold; + minMaxLocGold(src, &minVal_gold, &maxVal_gold); + + EXPECT_DOUBLE_EQ(minVal_gold, vals[0]); + EXPECT_DOUBLE_EQ(maxVal_gold, vals[1]); +} + CUDA_TEST_P(MinMax, WithMask) { cv::Mat src = randomMat(size, depth); @@ -471,6 +587,41 @@ CUDA_TEST_P(MinMaxLoc, WithoutMask) } } +CUDA_TEST_P(MinMaxLoc, Async) +{ + cv::Mat src = randomMat(size, depth); + + cv::cuda::Stream stream; + + cv::cuda::HostMem minMaxVals, locVals; + cv::cuda::findMinMaxLoc(loadMat(src, useRoi), minMaxVals, locVals, cv::noArray(), stream); + + stream.waitForCompletion(); + + double vals[2]; + const cv::Mat vals_mat(2, 1, CV_64FC1, &vals[0]); + minMaxVals.createMatHeader().convertTo(vals_mat, CV_64F); + + int locs[2]; + const cv::Mat locs_mat(2, 1, CV_32SC1, &locs[0]); + locVals.createMatHeader().copyTo(locs_mat); + + cv::Point locs2D[] = { + cv::Point(locs[0] % src.cols, locs[0] / src.cols), + cv::Point(locs[1] % src.cols, locs[1] / src.cols), + }; + + double minVal_gold, maxVal_gold; + cv::Point minLoc_gold, maxLoc_gold; + minMaxLocGold(src, &minVal_gold, &maxVal_gold, &minLoc_gold, &maxLoc_gold); + + EXPECT_DOUBLE_EQ(minVal_gold, vals[0]); + EXPECT_DOUBLE_EQ(maxVal_gold, vals[1]); + + expectEqual(src, minLoc_gold, locs2D[0]); + 
expectEqual(src, maxLoc_gold, locs2D[1]); +} + CUDA_TEST_P(MinMaxLoc, WithMask) { cv::Mat src = randomMat(size, depth); @@ -564,6 +715,7 @@ PARAM_TEST_CASE(CountNonZero, cv::cuda::DeviceInfo, cv::Size, MatDepth, UseRoi) int depth; bool useRoi; + cv::Mat src; virtual void SetUp() { @@ -573,15 +725,14 @@ PARAM_TEST_CASE(CountNonZero, cv::cuda::DeviceInfo, cv::Size, MatDepth, UseRoi) useRoi = GET_PARAM(3); cv::cuda::setDevice(devInfo.deviceID()); + + cv::Mat srcBase = randomMat(size, CV_8U, 0.0, 1.5); + srcBase.convertTo(src, depth); } }; CUDA_TEST_P(CountNonZero, Accuracy) { - cv::Mat srcBase = randomMat(size, CV_8U, 0.0, 1.5); - cv::Mat src; - srcBase.convertTo(src, depth); - if (depth == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE)) { try @@ -603,6 +754,24 @@ CUDA_TEST_P(CountNonZero, Accuracy) } } +CUDA_TEST_P(CountNonZero, Async) +{ + cv::cuda::Stream stream; + + cv::cuda::HostMem dst; + cv::cuda::countNonZero(loadMat(src, useRoi), dst, stream); + + stream.waitForCompletion(); + + int val; + const cv::Mat val_mat(1, 1, CV_32SC1, &val); + dst.createMatHeader().copyTo(val_mat); + + int val_gold = cv::countNonZero(src); + + ASSERT_EQ(val_gold, val); +} + INSTANTIATE_TEST_CASE_P(CUDA_Arithm, CountNonZero, testing::Combine( ALL_DEVICES, DIFFERENT_SIZES, @@ -750,7 +919,7 @@ CUDA_TEST_P(Normalize, WithMask) dst_gold.setTo(cv::Scalar::all(0)); cv::normalize(src, dst_gold, alpha, beta, norm_type, type, mask); - EXPECT_MAT_NEAR(dst_gold, dst, 1e-6); + EXPECT_MAT_NEAR(dst_gold, dst, type < CV_32F ? 1.0 : 1e-4); } INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Normalize, testing::Combine( @@ -811,6 +980,28 @@ CUDA_TEST_P(MeanStdDev, Accuracy) } } +CUDA_TEST_P(MeanStdDev, Async) +{ + cv::Mat src = randomMat(size, CV_8UC1); + + cv::cuda::Stream stream; + + cv::cuda::HostMem dst; + cv::cuda::meanStdDev(loadMat(src, useRoi), dst, stream); + + stream.waitForCompletion(); + + double vals[2]; + dst.createMatHeader().copyTo(cv::Mat(1, 2, CV_64FC1, &vals[0])); + + cv::Scalar mean_gold; + cv::Scalar stddev_gold; + cv::meanStdDev(src, mean_gold, stddev_gold); + + EXPECT_SCALAR_NEAR(mean_gold, cv::Scalar(vals[0]), 1e-5); + EXPECT_SCALAR_NEAR(stddev_gold, cv::Scalar(vals[1]), 1e-5); +} + INSTANTIATE_TEST_CASE_P(CUDA_Arithm, MeanStdDev, testing::Combine( ALL_DEVICES, DIFFERENT_SIZES, diff --git a/modules/cudabgsegm/src/fgd.cpp b/modules/cudabgsegm/src/fgd.cpp index 68f03a3e1..237f1c05f 100644 --- a/modules/cudabgsegm/src/fgd.cpp +++ b/modules/cudabgsegm/src/fgd.cpp @@ -266,7 +266,7 @@ namespace { int bgfgClassification(const GpuMat& prevFrame, const GpuMat& curFrame, const GpuMat& Ftd, const GpuMat& Fbd, - GpuMat& foreground, GpuMat& countBuf, + GpuMat& foreground, const FGDParams& params, int out_cn) { typedef void (*func_t)(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, @@ -298,7 +298,7 @@ namespace deltaC, deltaCC, params.alpha2, params.N1c, params.N1cc, 0); - int count = cuda::countNonZero(foreground, countBuf); + int count = cuda::countNonZero(foreground); cuda::multiply(foreground, Scalar::all(255), foreground); @@ -605,8 +605,6 @@ namespace GpuMat hist_; GpuMat histBuf_; - GpuMat countBuf_; - GpuMat buf_; GpuMat filterBrd_; @@ -649,7 +647,7 @@ namespace changeDetection(prevFrame_, curFrame, Ftd_, hist_, histBuf_); changeDetection(background_, curFrame, Fbd_, hist_, histBuf_); - int FG_pixels_count = bgfgClassification(prevFrame_, curFrame, Ftd_, Fbd_, foreground_, countBuf_, params_, 4); + int FG_pixels_count = bgfgClassification(prevFrame_, curFrame, 
Ftd_, Fbd_, foreground_, params_, 4); #ifdef HAVE_OPENCV_CUDAFILTERS if (params_.perform_morphing > 0) diff --git a/modules/cudafilters/src/filtering.cpp b/modules/cudafilters/src/filtering.cpp index 2ab35ccee..ed72a3ab5 100644 --- a/modules/cudafilters/src/filtering.cpp +++ b/modules/cudafilters/src/filtering.cpp @@ -542,7 +542,7 @@ namespace anchor_ = Point(iters_, iters_); iters_ = 1; } - else if (iters_ > 1 && countNonZero(kernel) == (int) kernel.total()) + else if (iters_ > 1 && cv::countNonZero(kernel) == (int) kernel.total()) { anchor_ = Point(anchor_.x * iters_, anchor_.y * iters_); kernel = getStructuringElement(MORPH_RECT, diff --git a/modules/cudaimgproc/src/gftt.cpp b/modules/cudaimgproc/src/gftt.cpp index 243665083..ab1f3e471 100644 --- a/modules/cudaimgproc/src/gftt.cpp +++ b/modules/cudaimgproc/src/gftt.cpp @@ -81,7 +81,6 @@ namespace GpuMat Dy_; GpuMat buf_; GpuMat eig_; - GpuMat minMaxbuf_; GpuMat tmpCorners_; }; @@ -109,7 +108,7 @@ namespace cornerCriteria_->compute(image, eig_); double maxVal = 0; - cuda::minMax(eig_, 0, &maxVal, noArray(), minMaxbuf_); + cuda::minMax(eig_, 0, &maxVal); ensureSizeIsEnough(1, std::max(1000, static_cast(image.size().area() * 0.05)), CV_32FC2, tmpCorners_); diff --git a/modules/cudaimgproc/src/match_template.cpp b/modules/cudaimgproc/src/match_template.cpp index c5ab143ec..25c42dfd9 100644 --- a/modules/cudaimgproc/src/match_template.cpp +++ b/modules/cudaimgproc/src/match_template.cpp @@ -271,7 +271,6 @@ namespace private: Match_CCORR_8U match_CCORR_; GpuMat image_sqsums_; - GpuMat intBuffer_; }; void Match_CCORR_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream) @@ -288,7 +287,7 @@ namespace match_CCORR_.match(image, templ, _result, stream); GpuMat result = _result.getGpuMat(); - cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream); + cuda::sqrIntegral(image.reshape(1), image_sqsums_, stream); double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0]; @@ -335,7 +334,6 @@ namespace private: GpuMat image_sqsums_; - GpuMat intBuffer_; Match_CCORR_8U match_CCORR_; }; @@ -359,7 +357,7 @@ namespace return; } - cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream); + cuda::sqrIntegral(image.reshape(1), image_sqsums_, stream); double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0]; @@ -383,7 +381,6 @@ namespace private: GpuMat image_sqsums_; - GpuMat intBuffer_; Match_CCORR_8U match_CCORR_; }; @@ -398,7 +395,7 @@ namespace CV_Assert( image.type() == templ.type() ); CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows ); - cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream); + cuda::sqrIntegral(image.reshape(1), image_sqsums_, stream); double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0]; @@ -421,7 +418,6 @@ namespace void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null()); private: - GpuMat intBuffer_; std::vector images_; std::vector image_sums_; Match_CCORR_8U match_CCORR_; @@ -444,7 +440,7 @@ namespace if (image.channels() == 1) { image_sums_.resize(1); - cuda::integral(image, image_sums_[0], intBuffer_, stream); + cuda::integral(image, image_sums_[0], stream); int templ_sum = (int) cuda::sum(templ)[0]; @@ -456,7 +452,7 @@ namespace image_sums_.resize(images_.size()); for (int i = 0; i < image.channels(); ++i) - cuda::integral(images_[i], image_sums_[i], intBuffer_, stream); + cuda::integral(images_[i], image_sums_[i], stream); Scalar templ_sum = cuda::sum(templ); @@ -501,7 +497,6 @@ namespace 
private: GpuMat imagef_, templf_; Match_CCORR_32F match_CCORR_32F_; - GpuMat intBuffer_; std::vector images_; std::vector image_sums_; std::vector image_sqsums_; @@ -527,10 +522,10 @@ namespace if (image.channels() == 1) { image_sums_.resize(1); - cuda::integral(image, image_sums_[0], intBuffer_, stream); + cuda::integral(image, image_sums_[0], stream); image_sqsums_.resize(1); - cuda::sqrIntegral(image, image_sqsums_[0], intBuffer_, stream); + cuda::sqrIntegral(image, image_sqsums_[0], stream); int templ_sum = (int) cuda::sum(templ)[0]; double templ_sqsum = cuda::sqrSum(templ)[0]; @@ -547,8 +542,8 @@ namespace image_sqsums_.resize(images_.size()); for (int i = 0; i < image.channels(); ++i) { - cuda::integral(images_[i], image_sums_[i], intBuffer_, stream); - cuda::sqrIntegral(images_[i], image_sqsums_[i], intBuffer_, stream); + cuda::integral(images_[i], image_sums_[i], stream); + cuda::sqrIntegral(images_[i], image_sqsums_[i], stream); } Scalar templ_sum = cuda::sum(templ); diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp index af3f874e1..0898031f0 100644 --- a/samples/gpu/performance/tests.cpp +++ b/samples/gpu/performance/tests.cpp @@ -193,7 +193,7 @@ TEST(cornerHarris) TEST(integral) { Mat src, sum; - cuda::GpuMat d_src, d_sum, d_buf; + cuda::GpuMat d_src, d_sum; for (int size = 1000; size <= 4000; size *= 2) { @@ -209,10 +209,10 @@ TEST(integral) d_src.upload(src); - cuda::integralBuffered(d_src, d_sum, d_buf); + cuda::integral(d_src, d_sum); CUDA_ON; - cuda::integralBuffered(d_src, d_sum, d_buf); + cuda::integral(d_src, d_sum); CUDA_OFF; } } From b4e7ee46c633b06e6c5ca8374b1bf22357c76fd1 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 24 Dec 2014 17:23:54 +0300 Subject: [PATCH 6/6] fix compilation without CUDA --- .../include/opencv2/core/private.cuda.hpp | 20 ++++++++--------- modules/core/src/cuda_gpu_mat.cpp | 22 +++++++++++++++++++ 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/modules/core/include/opencv2/core/private.cuda.hpp b/modules/core/include/opencv2/core/private.cuda.hpp index aaa777c91..5c473b857 100644 --- a/modules/core/include/opencv2/core/private.cuda.hpp +++ b/modules/core/include/opencv2/core/private.cuda.hpp @@ -80,6 +80,16 @@ namespace cv { namespace cuda { CV_EXPORTS cv::String getNppErrorMessage(int code); CV_EXPORTS cv::String getCudaDriverApiErrorMessage(int code); + + CV_EXPORTS GpuMat getInputMat(InputArray _src, Stream& stream); + + CV_EXPORTS GpuMat getOutputMat(OutputArray _dst, int rows, int cols, int type, Stream& stream); + static inline GpuMat getOutputMat(OutputArray _dst, Size size, int type, Stream& stream) + { + return getOutputMat(_dst, size.height, size.width, type, stream); + } + + CV_EXPORTS void syncOutput(const GpuMat& dst, OutputArray _dst, Stream& stream); }} #ifndef HAVE_CUDA @@ -106,16 +116,6 @@ namespace cv { namespace cuda GpuMat::Allocator* allocator_; }; - CV_EXPORTS GpuMat getInputMat(InputArray _src, Stream& stream); - - CV_EXPORTS GpuMat getOutputMat(OutputArray _dst, int rows, int cols, int type, Stream& stream); - static inline GpuMat getOutputMat(OutputArray _dst, Size size, int type, Stream& stream) - { - return getOutputMat(_dst, size.height, size.width, type, stream); - } - - CV_EXPORTS void syncOutput(const GpuMat& dst, OutputArray _dst, Stream& stream); - static inline void checkNppError(int code, const char* file, const int line, const char* func) { if (code < 0) diff --git a/modules/core/src/cuda_gpu_mat.cpp b/modules/core/src/cuda_gpu_mat.cpp index 
8a7b236ac..9a17ddd85 100644 --- a/modules/core/src/cuda_gpu_mat.cpp +++ b/modules/core/src/cuda_gpu_mat.cpp @@ -346,6 +346,11 @@ GpuMat cv::cuda::getInputMat(InputArray _src, Stream& stream) { GpuMat src; +#ifndef HAVE_CUDA + (void) _src; + (void) stream; + throw_no_cuda(); +#else if (_src.kind() == _InputArray::CUDA_GPU_MAT) { src = _src.getGpuMat(); @@ -356,6 +361,7 @@ GpuMat cv::cuda::getInputMat(InputArray _src, Stream& stream) src = pool.getBuffer(_src.size(), _src.type()); src.upload(_src, stream); } +#endif return src; } @@ -364,6 +370,14 @@ GpuMat cv::cuda::getOutputMat(OutputArray _dst, int rows, int cols, int type, St { GpuMat dst; +#ifndef HAVE_CUDA + (void) _dst; + (void) rows; + (void) cols; + (void) type; + (void) stream; + throw_no_cuda(); +#else if (_dst.kind() == _InputArray::CUDA_GPU_MAT) { _dst.create(rows, cols, type); @@ -374,12 +388,19 @@ GpuMat cv::cuda::getOutputMat(OutputArray _dst, int rows, int cols, int type, St BufferPool pool(stream); dst = pool.getBuffer(rows, cols, type); } +#endif return dst; } void cv::cuda::syncOutput(const GpuMat& dst, OutputArray _dst, Stream& stream) { +#ifndef HAVE_CUDA + (void) dst; + (void) _dst; + (void) stream; + throw_no_cuda(); +#else if (_dst.kind() != _InputArray::CUDA_GPU_MAT) { if (stream) @@ -387,6 +408,7 @@ void cv::cuda::syncOutput(const GpuMat& dst, OutputArray _dst, Stream& stream) else dst.download(_dst); } +#endif } #ifndef HAVE_CUDA
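
Taken together, the series reshapes the reduction API in two ways: every reduction gains a stream-aware variant that writes its small result into an OutputArray (calcSum, calcAbsSum, calcSqrSum, calcNorm, calcNormDiff, findMinMax, findMinMaxLoc, and the OutputArray forms of countNonZero and meanStdDev), while the blocking forms remain but lose their explicit buffer arguments. What follows is a minimal sketch of how a caller might drive the asynchronous forms end-to-end, mirroring the new Async tests above; the matrix size and contents are placeholders, and it assumes a build with the cudaarithm module enabled:

#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>

int main()
{
    cv::Mat h_src(480, 640, CV_8UC1);
    cv::randu(h_src, cv::Scalar::all(0), cv::Scalar::all(256));

    cv::cuda::Stream stream;

    cv::cuda::GpuMat d_src;
    d_src.upload(h_src, stream); // pageable Mat: the copy still blocks; use HostMem sources for full overlap

    // Each reduction writes its small result into page-locked host memory
    // instead of returning a value, so none of these calls synchronize.
    cv::cuda::HostMem sumRes, minMaxRes, nnzRes;
    cv::cuda::calcSum(d_src, sumRes, cv::noArray(), stream);
    cv::cuda::findMinMax(d_src, minMaxRes, cv::noArray(), stream);
    cv::cuda::countNonZero(d_src, nnzRes, stream);

    stream.waitForCompletion(); // single synchronization point

    // calcSum yields 1x1 CV_64FC(cn); findMinMax yields 1x2 with CV_32S
    // depth for integer sources, hence the convertTo to double. The
    // countNonZero result is always 1x1 CV_32SC1.
    double sum, vals[2];
    sumRes.createMatHeader().convertTo(cv::Mat(1, 1, CV_64FC1, &sum), CV_64F);
    minMaxRes.createMatHeader().convertTo(cv::Mat(1, 2, CV_64FC1, vals), CV_64F);
    const int nnz = nnzRes.createMatHeader().at<int>(0, 0);

    std::cout << "sum=" << sum << " min=" << vals[0]
              << " max=" << vals[1] << " nonzero=" << nnz << std::endl;
    return 0;
}

Because the result travels through an OutputArray, the blocking wrappers stay thin: they allocate a HostMem, call the asynchronous form on the null stream, wait, and convert, exactly as the new countNonZero(InputArray) and minMax(InputArray, ...) implementations do above.

For module authors, the same getInputMat/getOutputMat/syncOutput helpers are exported through core's private.cuda.hpp, so a routine written against them accepts both host and device arrays and remains asynchronous when given a stream (and, after PATCH 6/6, raises throw_no_cuda() in builds without CUDA instead of failing to link). Below is a sketch of the intended call pattern inside a hypothetical routine; myScale and its semantics are illustrative, not part of the series:

#include "opencv2/core/private.cuda.hpp"

void myScale(cv::InputArray _src, cv::OutputArray _dst, double alpha,
             cv::cuda::Stream& stream = cv::cuda::Stream::Null())
{
    using namespace cv::cuda;

    // Wraps _src when it is already a GpuMat; otherwise uploads it into a
    // BufferPool temporary on the given stream.
    GpuMat src = getInputMat(_src, stream);

    // Creates _dst in place when it is a GpuMat; otherwise hands back a
    // pooled temporary that syncOutput() downloads afterwards.
    GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);

    src.convertTo(dst, src.depth(), alpha, stream);

    // No-op for device outputs; asynchronous download for host outputs.
    syncOutput(dst, _dst, stream);
}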