diff --git a/modules/nonfree/doc/feature_detection.rst b/modules/nonfree/doc/feature_detection.rst
index bb2f6b038..c7ccb7493 100644
--- a/modules/nonfree/doc/feature_detection.rst
+++ b/modules/nonfree/doc/feature_detection.rst
@@ -129,7 +129,6 @@ The function is parallelized with the TBB library.
 If you are using the C version, make sure you call ``cv::initModule_nonfree()`` from ``nonfree/nonfree.hpp``.
 
 
-
 gpu::SURF_GPU
 -------------
 .. ocv:class:: gpu::SURF_GPU
@@ -230,3 +229,102 @@ The ``descriptors`` matrix is :math:`\texttt{nFeatures} \times \texttt{descripto
 The class ``SURF_GPU`` uses some buffers and provides access to it. All buffers can be safely released between function calls.
 
 .. seealso:: :ocv:class:`SURF`
+
+
+ocl::SURF_OCL
+-------------
+.. ocv:class:: ocl::SURF_OCL
+
+Class used for extracting Speeded Up Robust Features (SURF) from an image. ::
+
+    class SURF_OCL
+    {
+    public:
+        enum KeypointLayout
+        {
+            X_ROW = 0,
+            Y_ROW,
+            LAPLACIAN_ROW,
+            OCTAVE_ROW,
+            SIZE_ROW,
+            ANGLE_ROW,
+            HESSIAN_ROW,
+            ROWS_COUNT
+        };
+
+        //! the default constructor
+        SURF_OCL();
+        //! the full constructor taking all the necessary parameters
+        explicit SURF_OCL(double _hessianThreshold, int _nOctaves=4,
+             int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);
+
+        //! returns the descriptor size in float's (64 or 128)
+        int descriptorSize() const;
+
+        //! upload host keypoints to device memory
+        void uploadKeypoints(const vector<KeyPoint>& keypoints,
+            oclMat& keypointsocl);
+        //! download keypoints from device to host memory
+        void downloadKeypoints(const oclMat& keypointsocl,
+            vector<KeyPoint>& keypoints);
+
+        //! download descriptors from device to host memory
+        void downloadDescriptors(const oclMat& descriptorsocl,
+            vector<float>& descriptors);
+
+        void operator()(const oclMat& img, const oclMat& mask,
+            oclMat& keypoints);
+
+        void operator()(const oclMat& img, const oclMat& mask,
+            oclMat& keypoints, oclMat& descriptors,
+            bool useProvidedKeypoints = false);
+
+        void operator()(const oclMat& img, const oclMat& mask,
+            std::vector<KeyPoint>& keypoints);
+
+        void operator()(const oclMat& img, const oclMat& mask,
+            std::vector<KeyPoint>& keypoints, oclMat& descriptors,
+            bool useProvidedKeypoints = false);
+
+        void operator()(const oclMat& img, const oclMat& mask,
+            std::vector<KeyPoint>& keypoints,
+            std::vector<float>& descriptors,
+            bool useProvidedKeypoints = false);
+
+        void releaseMemory();
+
+        // SURF parameters
+        double hessianThreshold;
+        int nOctaves;
+        int nOctaveLayers;
+        bool extended;
+        bool upright;
+
+        //! max keypoints = min(keypointsRatio * img.size().area(), 65535)
+        float keypointsRatio;
+
+        oclMat sum, mask1, maskSum, intBuffer;
+
+        oclMat det, trace;
+
+        oclMat maxPosBuffer;
+    };
+
+
+The class ``SURF_OCL`` implements Speeded Up Robust Features descriptor. There is a fast multi-scale Hessian keypoint detector that can be used to find the keypoints (which is the default option). But the descriptors can also be computed for the user-specified keypoints. Only 8-bit grayscale images are supported.
+
+The class ``SURF_OCL`` can store results in the GPU and CPU memory. It provides functions to convert results between CPU and GPU version ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as ``SURF`` results. GPU results are stored in ``oclMat``. The ``keypoints`` matrix is :math:`\texttt{nFeatures} \times 7` matrix with the ``CV_32FC1`` type.
+
+* ``keypoints.ptr<float>(X_ROW)[i]`` contains x coordinate of the i-th feature.
+* ``keypoints.ptr<float>(Y_ROW)[i]`` contains y coordinate of the i-th feature.
+* ``keypoints.ptr<float>(LAPLACIAN_ROW)[i]``  contains the laplacian sign of the i-th feature.
+* ``keypoints.ptr<float>(OCTAVE_ROW)[i]`` contains the octave of the i-th feature.
+* ``keypoints.ptr<float>(SIZE_ROW)[i]`` contains the size of the i-th feature.
+* ``keypoints.ptr<float>(ANGLE_ROW)[i]`` contain orientation of the i-th feature.
+* ``keypoints.ptr<float>(HESSIAN_ROW)[i]`` contains the response of the i-th feature.
+
+The ``descriptors`` matrix is :math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type.
+
+The class ``SURF_OCL`` uses some buffers and provides access to it. All buffers can be safely released between function calls.
+
+.. seealso:: :ocv:class:`SURF`
\ No newline at end of file
diff --git a/modules/nonfree/include/opencv2/nonfree/ocl.hpp b/modules/nonfree/include/opencv2/nonfree/ocl.hpp
index aa2d01821..61b3c00a6 100644
--- a/modules/nonfree/include/opencv2/nonfree/ocl.hpp
+++ b/modules/nonfree/include/opencv2/nonfree/ocl.hpp
@@ -121,4 +121,4 @@ namespace cv
     }
 }
 
-#endif __OPENCV_NONFREE_OCL_HPP__
\ No newline at end of file
+#endif //__OPENCV_NONFREE_OCL_HPP__
\ No newline at end of file
diff --git a/modules/nonfree/perf/perf_main.cpp b/modules/nonfree/perf/perf_main.cpp
index 444ace981..de1242149 100644
--- a/modules/nonfree/perf/perf_main.cpp
+++ b/modules/nonfree/perf/perf_main.cpp
@@ -1,3 +1,4 @@
 #include "perf_precomp.hpp"
+#include "opencv2/ts/gpu_perf.hpp"
 
 CV_PERF_TEST_MAIN(nonfree, perf::printCudaInfo())
diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp
index 1e34a77db..d8336b938 100644
--- a/modules/nonfree/src/surf.ocl.cpp
+++ b/modules/nonfree/src/surf.ocl.cpp
@@ -75,10 +75,11 @@ namespace cv
 }
 
 
-static inline int divUp(int total, int grain)
+static inline int divUp(size_t total, size_t grain)
 {
     return (total + grain - 1) / grain;
 }
+
 static inline int calcSize(int octave, int layer)
 {
     /* Wavelet size at first layer of first octave. */
@@ -505,20 +506,20 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
     size_t localThreads[3]  = {16, 16, 1};
     size_t globalThreads[3] =
     {
-        divUp(max_samples_j, localThreads[0]) *localThreads[0],
-        divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
+        divUp(max_samples_j, localThreads[0]) * localThreads[0],
+        divUp(max_samples_i, localThreads[1]) * localThreads[1] *(nOctaveLayers + 2),
         1
     };
     openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
 }
 
 void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
-        int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols)
+        int octave, bool useMask, int nLayers, int layer_rows, int layer_cols)
 {
     const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
 
     Context *clCxt = det.clCxt;
-    string kernelName = use_mask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer";
+    string kernelName = useMask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer";
     vector< pair<size_t, const void *> > args;
 
     args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
@@ -537,7 +538,7 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat
     args.push_back( make_pair( sizeof(cl_int), (void *)&maxCandidates));
     args.push_back( make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold));
 
-    if(use_mask)
+    if(useMask)
     {
         if(maskSumTex)
         {
@@ -559,7 +560,7 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat
 }
 
 void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
-        oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures)
+        oclMat &keypoints, oclMat &counters_, int octave, int layer_rows, int max_features)
 {
     Context *clCxt = det.clCxt;
     string kernelName = "icvInterpolateKeypoint";
@@ -568,14 +569,14 @@ void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMa
     args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void *)&counters.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&counters_.data));
     args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
     args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
     args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
     args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
     args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
     args.push_back( make_pair( sizeof(cl_int), (void *)&layer_rows));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&maxFeatures));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&max_features));
 
     size_t localThreads[3]  = {3, 3, 3};
     size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};
diff --git a/modules/nonfree/test/test_main.cpp b/modules/nonfree/test/test_main.cpp
index 4f6cfd3e5..c9e33a943 100644
--- a/modules/nonfree/test/test_main.cpp
+++ b/modules/nonfree/test/test_main.cpp
@@ -69,3 +69,5 @@ int main(int argc, char **argv)
 #else // HAVE_CUDA
 
 CV_TEST_MAIN("cv")
+
+#endif // HAVE_CUDA
diff --git a/modules/nonfree/test/test_surf.ocl.cpp b/modules/nonfree/test/test_surf.ocl.cpp
index 069c6ba98..76ed37de4 100644
--- a/modules/nonfree/test/test_surf.ocl.cpp
+++ b/modules/nonfree/test/test_surf.ocl.cpp
@@ -144,7 +144,7 @@ PARAM_TEST_CASE(SURF, HessianThreshold, Octaves, OctaveLayers, Extended, Upright
     }
 };
 
-TEST_P(SURF, Detector)
+TEST_P(SURF, DISABLED_Detector)
 {
     cv::Mat image  = cv::imread(string(cvtest::TS::ptr()->get_data_path()) + "shared/fruits.png", cv::IMREAD_GRAYSCALE);
     ASSERT_FALSE(image.empty());
diff --git a/modules/ocl/doc/object_detection.rst b/modules/ocl/doc/object_detection.rst
index 0104da593..17eb62d0e 100644
--- a/modules/ocl/doc/object_detection.rst
+++ b/modules/ocl/doc/object_detection.rst
@@ -88,102 +88,3 @@ Computes a proximity map for a raster template and an image where the template i
     * ``CV_TM_CCORR``
 
 .. seealso:: :ocv:func:`matchTemplate`
-
-
-ocl::SURF_OCL
--------------
-.. ocv:class:: ocl::SURF_OCL
-
-Class used for extracting Speeded Up Robust Features (SURF) from an image. ::
-
-    class SURF_OCL
-    {
-    public:
-        enum KeypointLayout
-        {
-            X_ROW = 0,
-            Y_ROW,
-            LAPLACIAN_ROW,
-            OCTAVE_ROW,
-            SIZE_ROW,
-            ANGLE_ROW,
-            HESSIAN_ROW,
-            ROWS_COUNT
-        };
-
-        //! the default constructor
-        SURF_OCL();
-        //! the full constructor taking all the necessary parameters
-        explicit SURF_OCL(double _hessianThreshold, int _nOctaves=4,
-             int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);
-
-        //! returns the descriptor size in float's (64 or 128)
-        int descriptorSize() const;
-
-        //! upload host keypoints to device memory
-        void uploadKeypoints(const vector<KeyPoint>& keypoints,
-            oclMat& keypointsocl);
-        //! download keypoints from device to host memory
-        void downloadKeypoints(const oclMat& keypointsocl,
-            vector<KeyPoint>& keypoints);
-
-        //! download descriptors from device to host memory
-        void downloadDescriptors(const oclMat& descriptorsocl,
-            vector<float>& descriptors);
-
-        void operator()(const oclMat& img, const oclMat& mask,
-            oclMat& keypoints);
-
-        void operator()(const oclMat& img, const oclMat& mask,
-            oclMat& keypoints, oclMat& descriptors,
-            bool useProvidedKeypoints = false);
-
-        void operator()(const oclMat& img, const oclMat& mask,
-            std::vector<KeyPoint>& keypoints);
-
-        void operator()(const oclMat& img, const oclMat& mask,
-            std::vector<KeyPoint>& keypoints, oclMat& descriptors,
-            bool useProvidedKeypoints = false);
-
-        void operator()(const oclMat& img, const oclMat& mask,
-            std::vector<KeyPoint>& keypoints,
-            std::vector<float>& descriptors,
-            bool useProvidedKeypoints = false);
-
-        void releaseMemory();
-
-        // SURF parameters
-        double hessianThreshold;
-        int nOctaves;
-        int nOctaveLayers;
-        bool extended;
-        bool upright;
-
-        //! max keypoints = min(keypointsRatio * img.size().area(), 65535)
-        float keypointsRatio;
-
-        oclMat sum, mask1, maskSum, intBuffer;
-
-        oclMat det, trace;
-
-        oclMat maxPosBuffer;
-    };
-
-
-The class ``SURF_OCL`` implements Speeded Up Robust Features descriptor. There is a fast multi-scale Hessian keypoint detector that can be used to find the keypoints (which is the default option). But the descriptors can also be computed for the user-specified keypoints. Only 8-bit grayscale images are supported.
-
-The class ``SURF_OCL`` can store results in the GPU and CPU memory. It provides functions to convert results between CPU and GPU version ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as ``SURF`` results. GPU results are stored in ``oclMat``. The ``keypoints`` matrix is :math:`\texttt{nFeatures} \times 7` matrix with the ``CV_32FC1`` type.
-
-* ``keypoints.ptr<float>(X_ROW)[i]`` contains x coordinate of the i-th feature.
-* ``keypoints.ptr<float>(Y_ROW)[i]`` contains y coordinate of the i-th feature.
-* ``keypoints.ptr<float>(LAPLACIAN_ROW)[i]``  contains the laplacian sign of the i-th feature.
-* ``keypoints.ptr<float>(OCTAVE_ROW)[i]`` contains the octave of the i-th feature.
-* ``keypoints.ptr<float>(SIZE_ROW)[i]`` contains the size of the i-th feature.
-* ``keypoints.ptr<float>(ANGLE_ROW)[i]`` contain orientation of the i-th feature.
-* ``keypoints.ptr<float>(HESSIAN_ROW)[i]`` contains the response of the i-th feature.
-
-The ``descriptors`` matrix is :math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type.
-
-The class ``SURF_OCL`` uses some buffers and provides access to it. All buffers can be safely released between function calls.
-
-.. seealso:: :ocv:class:`SURF`
\ No newline at end of file
diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp
index fd6591566..405d92ccd 100644
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@@ -58,6 +58,12 @@ namespace cv
 {
     namespace ocl
     {
+        enum openCLMemcpyKind
+        {
+            clMemcpyHostToDevice = 0,
+            clMemcpyDeviceToHost,
+            clMemcpyDeviceToDevice
+        };
         ///////////////////////////OpenCL call wrappers////////////////////////////
         void CV_EXPORTS openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
                                           size_t widthInBytes, size_t height);
@@ -65,7 +71,7 @@ namespace cv
                                             size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type);
         void CV_EXPORTS openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
                                        const void *src, size_t spitch,
-                                       size_t width, size_t height, enum openCLMemcpyKind kind, int channels = -1);
+                                       size_t width, size_t height, openCLMemcpyKind kind, int channels = -1);
         void CV_EXPORTS openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
                                            const void *src, size_t spitch,
                                            size_t width, size_t height, int src_offset);
diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp
index 3f4c31644..d3fc9c2a2 100644
--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@@ -387,7 +387,7 @@ namespace cv
 
         void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
                             const void *src, size_t spitch,
-                            size_t width, size_t height, enum openCLMemcpyKind kind, int channels)
+                            size_t width, size_t height, openCLMemcpyKind kind, int channels)
         {
             size_t buffer_origin[3] = {0, 0, 0};
             size_t host_origin[3] = {0, 0, 0};
@@ -593,11 +593,11 @@ namespace cv
             size_t kernelWorkGroupSize;
             openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices[clCxt->impl->devnum],
                                                     CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
-            CV_Assert( (localThreads[0] <= clCxt->impl->maxWorkItemSizes[0]) &&
-                          (localThreads[1] <= clCxt->impl->maxWorkItemSizes[1]) &&
-                          (localThreads[2] <= clCxt->impl->maxWorkItemSizes[2]) &&
-                          ((localThreads[0] * localThreads[1] * localThreads[2]) <= kernelWorkGroupSize) &&
-                          (localThreads[0] * localThreads[1] * localThreads[2]) <= clCxt->impl->maxWorkGroupSize);
+            CV_Assert( localThreads[0] <= clCxt->impl->maxWorkItemSizes[0] );
+            CV_Assert( localThreads[1] <= clCxt->impl->maxWorkItemSizes[1] );
+            CV_Assert( localThreads[2] <= clCxt->impl->maxWorkItemSizes[2] );
+            CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= kernelWorkGroupSize );
+            CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= clCxt->impl->maxWorkGroupSize );
         }
 
 #ifdef PRINT_KERNEL_RUN_TIME
diff --git a/modules/ocl/src/mcwutil.cpp b/modules/ocl/src/mcwutil.cpp
index bc64fa24f..8b7e18764 100644
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@@ -43,17 +43,14 @@
 //
 //M*/
 
-#include "opencv2/ocl/private/util.hpp"
+#include "precomp.hpp"
 
-#if defined (HAVE_OPENCL)
 #ifndef CL_VERSION_1_2
 #define CL_VERSION_1_2 0
 #endif
 
 using namespace std;
 
-
-
 namespace cv
 {
     namespace ocl
@@ -180,7 +177,7 @@ namespace cv
             texture = clCreateImage((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
 #else
             texture = clCreateImage2D(
-                mat.clCxt->impl->clContext,
+                (cl_context)mat.clCxt->oclContext(),
                 CL_MEM_READ_WRITE,
                 &format,
                 mat.cols,
@@ -254,4 +251,3 @@ namespace cv
     }//namespace ocl
 
 }//namespace cv
-#endif
\ No newline at end of file
diff --git a/modules/ocl/src/safe_call.hpp b/modules/ocl/src/safe_call.hpp
index c8c19f6ed..441495f86 100644
--- a/modules/ocl/src/safe_call.hpp
+++ b/modules/ocl/src/safe_call.hpp
@@ -65,12 +65,6 @@ namespace cv
 {
     namespace ocl
     {
-        enum openCLMemcpyKind
-        {
-            clMemcpyHostToDevice = 0,
-            clMemcpyDeviceToHost,
-            clMemcpyDeviceToDevice
-        };
         void error( const char *error_string, const char *file, const int line, const char *func = "");
         const char *getOpenCLErrorString( int err );