diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp
index 3081303ef..230a0f82c 100644
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -680,6 +680,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
 
     bool aligned = isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16);
 
+#if CUDART_VERSION == 4000 
     if (aligned && src1.depth() == CV_8U && (src1.cols * src1.channels()) % 4 == 0)
     {
         NppStreamHandler h(stream);
@@ -692,42 +693,48 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
-    else if (aligned && src1.depth() == CV_8U)
+    else 
+#endif
     {
-        NppStreamHandler h(stream);
+        if (aligned && src1.depth() == CV_8U)
+        {
+            NppStreamHandler h(stream);
 
-        nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), 
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
+            nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), 
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
 
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+#if CUDART_VERSION == 4000 
+        else if (aligned && src1.depth() == CV_32S)
+        {
+            NppStreamHandler h(stream);
+
+            nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), static_cast<int>(src1.step), src2.ptr<Npp32s>(), static_cast<int>(src2.step), 
+                dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+#endif
+        else if (aligned && src1.depth() == CV_32F)
+        {
+            NppStreamHandler h(stream);
+
+            nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), static_cast<int>(src1.step), src2.ptr<Npp32f>(), static_cast<int>(src2.step), 
+                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+        else
+        {
+            const func_t func = funcs[src1.depth()];
+            CV_Assert(func != 0);
+
+            func(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
     }
-    else if (aligned && src1.depth() == CV_32S)
-    {
-        NppStreamHandler h(stream);
-
-        nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), static_cast<int>(src1.step), src2.ptr<Npp32s>(), static_cast<int>(src2.step), 
-            dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-    else if (aligned && src1.depth() == CV_32F)
-    {
-        NppStreamHandler h(stream);
-
-        nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), static_cast<int>(src1.step), src2.ptr<Npp32f>(), static_cast<int>(src2.step), 
-            dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-    else
-    {
-        const func_t func = funcs[src1.depth()];
-        CV_Assert(func != 0);
-
-        func(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
     }
 }
 
diff --git a/modules/gpu/src/graphcuts.cpp b/modules/gpu/src/graphcuts.cpp
index f3c4a06e2..90ccadc0a 100644
--- a/modules/gpu/src/graphcuts.cpp
+++ b/modules/gpu/src/graphcuts.cpp
@@ -77,8 +77,18 @@ void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTrans
 
     NppStreamHandler h(stream);
 
+#if CUDART_VERSION > 4000 
+    NppiGraphcutState* pState;
+    nppSafeCall( nppiGraphcutInitAlloc(sznpp, &pState, buf.ptr<Npp8u>()) );
+    
+    nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
+        static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), pState) );
+
+    nppSafeCall( nppiGraphcutFree(pState) );
+#else
     nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
         static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), buf.ptr<Npp8u>()) );
+#endif
 
     if (stream == 0)
         cudaSafeCall( cudaDeviceSynchronize() );
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index e5ea90c6b..ed243a38e 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -935,6 +935,31 @@ void cv::gpu::columnSum(const GpuMat& src, GpuMat& dst)
 
 void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s)
 {
+#if CUDART_VERSION > 4000 
+    CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_64FC1);
+
+    dst.create(src.size(), CV_32FC1);
+
+    NppiSize sz;
+    sz.width = src.cols;
+    sz.height = src.rows;
+
+    NppiRect nppRect;
+    nppRect.height = rect.height;
+    nppRect.width = rect.width;
+    nppRect.x = rect.x;
+    nppRect.y = rect.y;
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    NppStreamHandler h(stream);
+
+    nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), sqr.ptr<Npp64f>(), static_cast<int>(sqr.step),
+                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, nppRect) );
+
+    if (stream == 0)
+        cudaSafeCall( cudaDeviceSynchronize() );
+#else
     CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_32FC1);
 
     dst.create(src.size(), CV_32FC1);
@@ -958,6 +983,7 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
 
     if (stream == 0)
         cudaSafeCall( cudaDeviceSynchronize() );
+#endif
 }
 
 
diff --git a/modules/gpu/src/matrix_reductions.cpp b/modules/gpu/src/matrix_reductions.cpp
index 71ce0b701..10e4e8119 100644
--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
@@ -117,7 +117,15 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
 
     DeviceBuffer dbuf(2);
 
+#if CUDART_VERSION > 4000 
+    int bufSize;
+    nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
+
+    GpuMat buf(1, bufSize, CV_8UC1);
+    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dbuf, (double*)dbuf + 1) );
+#else
     nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, dbuf, (double*)dbuf + 1) );
+#endif
 
     cudaSafeCall( cudaDeviceSynchronize() );