diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp index 3081303ef..230a0f82c 100644 --- a/modules/gpu/src/element_operations.cpp +++ b/modules/gpu/src/element_operations.cpp @@ -680,6 +680,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea bool aligned = isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16); +#if CUDART_VERSION == 4000 if (aligned && src1.depth() == CV_8U && (src1.cols * src1.channels()) % 4 == 0) { NppStreamHandler h(stream); @@ -692,42 +693,48 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - else if (aligned && src1.depth() == CV_8U) + else +#endif { - NppStreamHandler h(stream); + if (aligned && src1.depth() == CV_8U) + { + NppStreamHandler h(stream); - nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), - dst.ptr(), static_cast(dst.step), sz) ); + nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), + dst.ptr(), static_cast(dst.step), sz) ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +#if CUDART_VERSION == 4000 + else if (aligned && src1.depth() == CV_32S) + { + NppStreamHandler h(stream); + + nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), + dst.ptr(), static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +#endif + else if (aligned && src1.depth() == CV_32F) + { + NppStreamHandler h(stream); + + nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), + dst.ptr(), static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + else + { + const func_t func = funcs[src1.depth()]; + CV_Assert(func != 0); + + func(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream); } - else if (aligned && src1.depth() == CV_32S) - { - NppStreamHandler h(stream); - - nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), - dst.ptr(), static_cast(dst.step), sz) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - else if (aligned && src1.depth() == CV_32F) - { - NppStreamHandler h(stream); - - nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr(), static_cast(src1.step), src2.ptr(), static_cast(src2.step), - dst.ptr(), static_cast(dst.step), sz) ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - else - { - const func_t func = funcs[src1.depth()]; - CV_Assert(func != 0); - - func(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream); } } diff --git a/modules/gpu/src/graphcuts.cpp b/modules/gpu/src/graphcuts.cpp index f3c4a06e2..90ccadc0a 100644 --- a/modules/gpu/src/graphcuts.cpp +++ b/modules/gpu/src/graphcuts.cpp @@ -77,8 +77,18 @@ void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTrans NppStreamHandler h(stream); +#if CUDART_VERSION > 4000 + NppiGraphcutState* pState; + nppSafeCall( nppiGraphcutInitAlloc(sznpp, &pState, buf.ptr()) ); + + nppSafeCall( nppiGraphcut_32s8u(terminals.ptr(), leftTransp.ptr(), rightTransp.ptr(), top.ptr(), bottom.ptr(), + static_cast(terminals.step), static_cast(leftTransp.step), sznpp, labels.ptr(), static_cast(labels.step), pState) ); + + nppSafeCall( nppiGraphcutFree(pState) ); +#else nppSafeCall( nppiGraphcut_32s8u(terminals.ptr(), leftTransp.ptr(), rightTransp.ptr(), top.ptr(), bottom.ptr(), static_cast(terminals.step), static_cast(leftTransp.step), sznpp, labels.ptr(), static_cast(labels.step), buf.ptr()) ); +#endif if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index e5ea90c6b..ed243a38e 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -935,6 +935,31 @@ void cv::gpu::columnSum(const GpuMat& src, GpuMat& dst) void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s) { +#if CUDART_VERSION > 4000 + CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_64FC1); + + dst.create(src.size(), CV_32FC1); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + NppiRect nppRect; + nppRect.height = rect.height; + nppRect.width = rect.width; + nppRect.x = rect.x; + nppRect.y = rect.y; + + cudaStream_t stream = StreamAccessor::getStream(s); + + NppStreamHandler h(stream); + + nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr(), static_cast(src.step), sqr.ptr(), static_cast(sqr.step), + dst.ptr(), static_cast(dst.step), sz, nppRect) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +#else CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_32FC1); dst.create(src.size(), CV_32FC1); @@ -958,6 +983,7 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); +#endif } diff --git a/modules/gpu/src/matrix_reductions.cpp b/modules/gpu/src/matrix_reductions.cpp index 71ce0b701..10e4e8119 100644 --- a/modules/gpu/src/matrix_reductions.cpp +++ b/modules/gpu/src/matrix_reductions.cpp @@ -117,7 +117,15 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev) DeviceBuffer dbuf(2); +#if CUDART_VERSION > 4000 + int bufSize; + nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) ); + + GpuMat buf(1, bufSize, CV_8UC1); + nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr(), static_cast(src.step), sz, buf.ptr(), dbuf, (double*)dbuf + 1) ); +#else nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr(), static_cast(src.step), sz, dbuf, (double*)dbuf + 1) ); +#endif cudaSafeCall( cudaDeviceSynchronize() );