diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt
index b8b8e4310..7e8389fcf 100644
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -41,13 +41,6 @@ if (HAVE_CUDA)
     file(GLOB_RECURSE ncv_hdr1 "src/nvidia/*.hpp")
     file(GLOB_RECURSE ncv_hdr2 "src/nvidia/*.h")
 
-    if (NOT MSVC)
-        file(GLOB vvv "src/nvidia/*.cu")
-        list(GET vvv 0 vv)
-        list(REMOVE_ITEM ncv_cuda ${vv})
-    endif()
-    message(STATUS ${ncv_cuda})
-
     source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdr1} ${ncv_hdr2} ${ncv_cuda})
     include_directories("src/nvidia/core" "src/nvidia/NPP_staging")
 endif()
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index 37acc59bd..511eaabac 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -47,7 +47,7 @@ using namespace cv::gpu;
 using namespace std;
 
-#if !defined (HAVE_CUDA) || (defined(_MSC_VER) && _MSC_VER != 1500) || !defined(_MSC_VER)
+#if !defined (HAVE_CUDA)
 
 cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU()              { throw_nogpu(); }
 cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const string&) { throw_nogpu(); }
@@ -59,13 +59,6 @@ Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const { throw_nogpu();
 
 int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& , GpuMat& , double , int , Size) { throw_nogpu(); return 0; }
 
-#if defined (HAVE_CUDA)
-    NCVStatus loadFromXML(const string&, HaarClassifierCascadeDescriptor&, vector<HaarStage64>&,
-                          vector<HaarClassifierNode128>&, vector<HaarFeature64>&) { throw_nogpu(); return NCVStatus(); }
-
-    void groupRectangles(vector<NcvRect32u>&, int, double, vector<Ncv32u>*) { throw_nogpu(); }
-#endif
-
 #else
 
 struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
@@ -270,7 +263,11 @@ Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const
 }
 
 int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize)
-{
+{
+    #if !defined(_MSC_VER)
+        CV_Assert(!"FD under not-VS2008 is not implemented");
+    #endif
+
     CV_Assert( scaleFactor > 1 && image.depth() == CV_8U);
     CV_Assert( !this->empty());
 
diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
index 5abb91a3e..19d4ab488 100644
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
@@ -748,22 +748,25 @@ void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
                                                   NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
 {
+
+    applyHaarClassifierAnchorParallelFunctor functor(gridConf, blockConf, cuStream,
+                                                     d_IImg, IImgStride,
+                                                     d_weights, weightsStride,
+                                                     d_Features, d_ClassifierNodes, d_Stages,
+                                                     d_inMask, d_outMask,
+                                                     mask1Dlen, mask2Dstride,
+                                                     anchorsRoi, startStageInc,
+                                                     endStageExc, scaleArea);
+
     //Second parameter is the number of "dynamic" template parameters
     NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
-    ::call( applyHaarClassifierAnchorParallelFunctor(gridConf, blockConf, cuStream,
-                                                     d_IImg, IImgStride,
-                                                     d_weights, weightsStride,
-                                                     d_Features, d_ClassifierNodes, d_Stages,
-                                                     d_inMask, d_outMask,
-                                                     mask1Dlen, mask2Dstride,
-                                                     anchorsRoi, startStageInc,
-                                                     endStageExc, scaleArea),
-            0xC001C0DE, //this is dummy int for the va_args C compatibility
-            tbInitMaskPositively,
-            tbCacheTextureIImg,
-            tbCacheTextureCascade,
-            tbReadPixelIndexFromVector,
-            tbDoAtomicCompaction);
+    ::call( &functor,
+            0xC001C0DE, //this is dummy int for the va_args C compatibility
+            tbInitMaskPositively,
+            tbCacheTextureIImg,
+            tbCacheTextureCascade,
+            tbReadPixelIndexFromVector,
+            tbDoAtomicCompaction);
 }
 
 
@@ -851,20 +854,22 @@ void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg,
                                                       NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
 {
+    applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream,
+                                                         d_IImg, IImgStride,
+                                                         d_weights, weightsStride,
+                                                         d_Features, d_ClassifierNodes, d_Stages,
+                                                         d_inMask, d_outMask,
+                                                         mask1Dlen, mask2Dstride,
+                                                         anchorsRoi, startStageInc,
+                                                         endStageExc, scaleArea);
+
     //Second parameter is the number of "dynamic" template parameters
     NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
-    ::call( applyHaarClassifierClassifierParallelFunctor(gridConf, blockConf, cuStream,
-                                                         d_IImg, IImgStride,
-                                                         d_weights, weightsStride,
-                                                         d_Features, d_ClassifierNodes, d_Stages,
-                                                         d_inMask, d_outMask,
-                                                         mask1Dlen, mask2Dstride,
-                                                         anchorsRoi, startStageInc,
-                                                         endStageExc, scaleArea),
-            0xC001C0DE, //this is dummy int for the va_args C compatibility
-            tbCacheTextureIImg,
-            tbCacheTextureCascade,
-            tbDoAtomicCompaction);
+    ::call( &functor,
+            0xC001C0DE, //this is dummy int for the va_args C compatibility
+            tbCacheTextureIImg,
+            tbCacheTextureCascade,
+            tbDoAtomicCompaction);
 }
 
 
@@ -920,15 +925,17 @@ void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask,
                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                      NcvSize32u anchorsRoi, Ncv32u step)
 {
+    initializeMaskVectorFunctor functor(gridConf, blockConf, cuStream,
+                                        d_inMask, d_outMask,
+                                        mask1Dlen, mask2Dstride,
+                                        anchorsRoi, step);
+
     //Second parameter is the number of "dynamic" template parameters
     NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor>
-    ::call( initializeMaskVectorFunctor(gridConf, blockConf, cuStream,
-                                        d_inMask, d_outMask,
-                                        mask1Dlen, mask2Dstride,
-                                        anchorsRoi, step),
-            0xC001C0DE, //this is dummy int for the va_args C compatibility
-            tbMaskByInmask,
-            tbDoAtomicCompaction);
+    ::call( &functor,
+            0xC001C0DE, //this is dummy int for the va_args C compatibility
+            tbMaskByInmask,
+            tbDoAtomicCompaction);
 }
 
diff --git a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
index 23795873f..bbe5a7708 100644
--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
@@ -166,12 +166,6 @@ struct T_false {};
 template <typename T1, typename T2> struct is_same : T_false {};
 template <typename T> struct is_same<T, T> : T_true {};
 
-template <int v>
-struct Int2Type
-{
-    enum { value = v };
-};
-
 template <class T_in, class T_out>
 struct _scanElemOp
@@ -179,14 +173,16 @@ struct _scanElemOp
 {
     template <bool tbDoSqr> static inline __host__ __device__ T_out scanElemOp(T_in elem)
     {
-        return scanElemOp_( elem, Int2Type<(int)tbDoSqr>() );
+        return scanElemOp( elem, Int2Type<(int)tbDoSqr>() );
     }
 private:
-    static inline __host__ __device__ T_out scanElemOp_(T_in elem,const Int2Type<0>&)
+    template <int v> struct Int2Type { enum { value = v }; };
+
+    static inline __host__ __device__ T_out scanElemOp(T_in elem,const Int2Type<0>&)
     {
         return (T_out)elem;
     }
-    static inline __host__ __device__ T_out scanElemOp_(T_in elem, const Int2Type<1>&)
+    static inline __host__ __device__ T_out scanElemOp(T_in elem, const Int2Type<1>&)
     {
         return (T_out)(elem*elem);
     }
diff --git a/modules/gpu/src/nvidia/core/NCV.hpp b/modules/gpu/src/nvidia/core/NCV.hpp
index 81eb417fb..9ec7743df 100644
--- a/modules/gpu/src/nvidia/core/NCV.hpp
+++ b/modules/gpu/src/nvidia/core/NCV.hpp
@@ -523,7 +523,7 @@ public:
         clear();
     }
 
-    virtual ~NCVVector() {} 
+    virtual ~NCVVector() {}
 
     void clear()
     {
@@ -579,7 +579,7 @@ class NCVVectorAlloc : public NCVVector<T>
 {
     NCVVectorAlloc();
     NCVVectorAlloc(const NCVVectorAlloc &);
-    NCVVectorAlloc& operator=(const NCVVectorAlloc&); 
+    NCVVectorAlloc& operator=(const NCVVectorAlloc&);
 
 public:
 
@@ -701,7 +701,7 @@ public:
         clear();
     }
 
-    virtual ~NCVMatrix() {} 
+    virtual ~NCVMatrix() {}
 
 
     void clear()
@@ -772,7 +772,7 @@ class NCVMatrixAlloc : public NCVMatrix<T>
 {
     NCVMatrixAlloc();
     NCVMatrixAlloc(const NCVMatrixAlloc &);
-    NCVMatrixAlloc& operator=(const NCVMatrixAlloc &); 
+    NCVMatrixAlloc& operator=(const NCVMatrixAlloc &);
 
 public:
     NCVMatrixAlloc(INCVMemAllocator &allocator, Ncv32u width, Ncv32u height, Ncv32u pitch=0)
diff --git a/modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp b/modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp
index ea29e660c..30f3e816a 100644
--- a/modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp
+++ b/modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp
@@ -146,10 +146,10 @@ namespace NCVRuntimeTemplateBool
    {
        //Convenience function used by the user
        //Takes a variable argument list, transforms it into a list
-       static void call(Func &functor, Ncv32u dummy, ...)
+       static void call(Func *functor, int dummy, ...)
        {
            //Vector used to collect arguments
-           std::vector<NcvBool> templateParamList;
+           std::vector<int> templateParamList;
 
            //Variable argument list manipulation
            va_list listPointer;
@@ -157,18 +157,18 @@ namespace NCVRuntimeTemplateBool
            //Collect parameters into the list
            for(int i=0; i<NumArguments; i++)
            {
-               NcvBool val = va_arg(listPointer, NcvBool);
+               int val = va_arg(listPointer, int);
                templateParamList.push_back(val);
            }
            va_end(listPointer);
 
            //Call the actual typed function
-           call(functor, templateParamList);
+           call(*functor, templateParamList);
        }
 
-       static void call( Func &functor, std::vector<NcvBool> &templateParamList)
+       static void call( Func &functor, std::vector<int> &templateParamList)
        {
            //Get current parameter value in the list
            NcvBool val = templateParamList[templateParamList.size() - 1];
@@ -205,7 +205,7 @@ namespace NCVRuntimeTemplateBool
            functor.call(TList()); //TList instantiated to get the method template parameter resolved
        }
 
-       static void call(Func &functor, std::vector<NcvBool> &templateParams)
+       static void call(Func &functor, std::vector<int> &templateParams)
        {
            functor.call(TList());
        }
diff --git a/samples/gpu/cascadeclassifier_nvidia_api.cpp b/samples/gpu/cascadeclassifier_nvidia_api.cpp
index 0f2aec9b5..c56a559ef 100644
--- a/samples/gpu/cascadeclassifier_nvidia_api.cpp
+++ b/samples/gpu/cascadeclassifier_nvidia_api.cpp
@@ -2,8 +2,8 @@
 #include
 #include "cvconfig.h"
 
-#if !defined(HAVE_CUDA) || defined(__GNUC__)
-    int main( int argc, const char** argv ) { return printf("Please compile the librarary with CUDA support."), -1; }
+#if !defined(HAVE_CUDA)
+    int main( int argc, const char** argv ) { return printf("Please compile the library with CUDA support."), -1; }
 #else
 
 #include
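
Note on the NCVRuntimeTemplateBool changes above: the patched call sites construct a named functor and pass its address, and the variadic boolean flags are collected as int. The standalone sketch below is my illustration, not part of the patch (SampleFunctor and dispatchTwoFlags are made-up names); it shows the same pattern in miniature under the two assumptions the patch relies on: a named object avoids binding a temporary to a non-const reference, and bool arguments are promoted to int when passed through "...", so va_arg has to read them back as int.

// Minimal sketch of the run-time -> compile-time flag dispatch; hypothetical names, not the NCV sources.
#include <cstdarg>
#include <cstdio>

struct SampleFunctor
{
    // The resolved flags arrive as compile-time template parameters,
    // so a real kernel launch could be specialized per combination.
    template <bool tbFlagA, bool tbFlagB>
    void call() const
    {
        std::printf("dispatched with flagA=%d flagB=%d\n", int(tbFlagA), int(tbFlagB));
    }
};

// Turns two run-time booleans into template arguments by branching once per flag.
// The functor is passed by pointer (a named lvalue at the call site), and the
// flags are read back as int because bool is promoted to int through "...".
template <typename Func>
void dispatchTwoFlags(Func *functor, int dummy, ...)
{
    va_list listPointer;
    va_start(listPointer, dummy);
    bool flagA = va_arg(listPointer, int) != 0;
    bool flagB = va_arg(listPointer, int) != 0;
    va_end(listPointer);

    if (flagA)
    {
        if (flagB) functor->template call<true,  true >();
        else       functor->template call<true,  false>();
    }
    else
    {
        if (flagB) functor->template call<false, true >();
        else       functor->template call<false, false>();
    }
}

int main()
{
    SampleFunctor functor;                  // named object, as in the patched call sites
    dispatchTwoFlags(&functor, 0xC001C0DE,  // dummy int, as in the original call sites
                     true, false);          // run-time flags become template arguments
    return 0;
}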