Normalize line endings and whitespace

2012-10-17 11:12:04 +04:00
parent 0442bca235
commit 81f826db2b
1511 changed files with 258678 additions and 258624 deletions
--- a/samples/gpu/CMakeLists.txt
+++ b/samples/gpu/CMakeLists.txt
@@ -1,70 +1,70 @@
-SET(OPENCV_GPU_SAMPLES_REQUIRED_DEPS opencv_core opencv_flann opencv_imgproc opencv_highgui
-                                     opencv_ml opencv_video opencv_objdetect opencv_features2d
-                                     opencv_calib3d opencv_legacy opencv_contrib opencv_gpu
-                                     opencv_nonfree)
-
-ocv_check_dependencies(${OPENCV_GPU_SAMPLES_REQUIRED_DEPS})
-
-if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
-  set(project "gpu")
-  string(TOUPPER "${project}" project_upper)
-
-  project("${project}_samples")
-
-  ocv_include_modules(${OPENCV_GPU_SAMPLES_REQUIRED_DEPS})
-  ocv_include_directories(
-    "${OpenCV_SOURCE_DIR}/modules/gpu/src/nvidia"
-    "${OpenCV_SOURCE_DIR}/modules/gpu/src/nvidia/core"
-    )
-
-  if(HAVE_CUDA)
-    ocv_include_directories(${CUDA_INCLUDE_DIRS})
-  endif()
-
-  if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function")
-  endif()
-
-  # ---------------------------------------------
-  #      Define executable targets
-  # ---------------------------------------------
-  MACRO(OPENCV_DEFINE_GPU_EXAMPLE name srcs)
-    set(the_target "example_${project}_${name}")
-    add_executable(${the_target} ${srcs})
-
-    target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${OPENCV_GPU_SAMPLES_REQUIRED_DEPS})
-
-    set_target_properties(${the_target} PROPERTIES
-      OUTPUT_NAME "${name}_${project}"
-      PROJECT_LABEL "(EXAMPLE_${project_upper}) ${name}")
-
-    if(ENABLE_SOLUTION_FOLDERS)
-      set_target_properties(${the_target} PROPERTIES FOLDER "samples//${project}")
-    endif()
-
-    if(WIN32)
-      if(MSVC AND NOT BUILD_SHARED_LIBS)
-        set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG")
-      endif()
-      install(TARGETS ${the_target} RUNTIME DESTINATION "samples/${project}" COMPONENT main)
-    endif()
-  ENDMACRO()
-
-  file(GLOB all_samples RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
-
-  foreach(sample_filename ${all_samples})
-    get_filename_component(sample ${sample_filename} NAME_WE)
-    file(GLOB sample_srcs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${sample}.*)
-    OPENCV_DEFINE_GPU_EXAMPLE(${sample} ${sample_srcs})
-  endforeach()
-
-  include("performance/CMakeLists.txt")
-endif()
-
-if (INSTALL_C_EXAMPLES AND NOT WIN32)
-  file(GLOB install_list *.c *.cpp *.jpg *.png *.data makefile.* build_all.sh *.dsp *.cmd )
-  install(FILES ${install_list}
-          DESTINATION share/OpenCV/samples/${project}
-          PERMISSIONS OWNER_READ GROUP_READ WORLD_READ)
-endif()
-
+SET(OPENCV_GPU_SAMPLES_REQUIRED_DEPS opencv_core opencv_flann opencv_imgproc opencv_highgui
+                                     opencv_ml opencv_video opencv_objdetect opencv_features2d
+                                     opencv_calib3d opencv_legacy opencv_contrib opencv_gpu
+                                     opencv_nonfree)
+
+ocv_check_dependencies(${OPENCV_GPU_SAMPLES_REQUIRED_DEPS})
+
+if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
+  set(project "gpu")
+  string(TOUPPER "${project}" project_upper)
+
+  project("${project}_samples")
+
+  ocv_include_modules(${OPENCV_GPU_SAMPLES_REQUIRED_DEPS})
+  ocv_include_directories(
+    "${OpenCV_SOURCE_DIR}/modules/gpu/src/nvidia"
+    "${OpenCV_SOURCE_DIR}/modules/gpu/src/nvidia/core"
+    )
+
+  if(HAVE_CUDA)
+    ocv_include_directories(${CUDA_INCLUDE_DIRS})
+  endif()
+
+  if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function")
+  endif()
+
+  # ---------------------------------------------
+  #      Define executable targets
+  # ---------------------------------------------
+  MACRO(OPENCV_DEFINE_GPU_EXAMPLE name srcs)
+    set(the_target "example_${project}_${name}")
+    add_executable(${the_target} ${srcs})
+
+    target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${OPENCV_GPU_SAMPLES_REQUIRED_DEPS})
+
+    set_target_properties(${the_target} PROPERTIES
+      OUTPUT_NAME "${name}_${project}"
+      PROJECT_LABEL "(EXAMPLE_${project_upper}) ${name}")
+
+    if(ENABLE_SOLUTION_FOLDERS)
+      set_target_properties(${the_target} PROPERTIES FOLDER "samples//${project}")
+    endif()
+
+    if(WIN32)
+      if(MSVC AND NOT BUILD_SHARED_LIBS)
+        set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG")
+      endif()
+      install(TARGETS ${the_target} RUNTIME DESTINATION "samples/${project}" COMPONENT main)
+    endif()
+  ENDMACRO()
+
+  file(GLOB all_samples RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp)
+
+  foreach(sample_filename ${all_samples})
+    get_filename_component(sample ${sample_filename} NAME_WE)
+    file(GLOB sample_srcs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${sample}.*)
+    OPENCV_DEFINE_GPU_EXAMPLE(${sample} ${sample_srcs})
+  endforeach()
+
+  include("performance/CMakeLists.txt")
+endif()
+
+if (INSTALL_C_EXAMPLES AND NOT WIN32)
+  file(GLOB install_list *.c *.cpp *.jpg *.png *.data makefile.* build_all.sh *.dsp *.cmd )
+  install(FILES ${install_list}
+          DESTINATION share/OpenCV/samples/${project}
+          PERMISSIONS OWNER_READ GROUP_READ WORLD_READ)
+endif()
+
--- a/samples/gpu/alpha_comp.cpp
+++ b/samples/gpu/alpha_comp.cpp
@@ -1,68 +1,68 @@
-#include <iostream>
-
-#include "opencv2/core/opengl_interop.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/gpu/gpu.hpp"
-
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-
-int main()
-{
-    cout << "This program demonstrates using alphaComp" << endl;
-    cout << "Press SPACE to change compositing operation" << endl;
-    cout << "Press ESC to exit" << endl;
-
-    namedWindow("First Image", WINDOW_NORMAL);
-    namedWindow("Second Image", WINDOW_NORMAL);
-    namedWindow("Result", WINDOW_OPENGL);
-
-    setGlDevice();
-
-    Mat src1(640, 480, CV_8UC4, Scalar::all(0));
-    Mat src2(640, 480, CV_8UC4, Scalar::all(0));
-
-    rectangle(src1, Rect(50, 50, 200, 200), Scalar(0, 0, 255, 128), 30);
-    rectangle(src2, Rect(100, 100, 200, 200), Scalar(255, 0, 0, 128), 30);
-
-    GpuMat d_src1(src1);
-    GpuMat d_src2(src2);
-
-    GpuMat d_res;
-
-    imshow("First Image", src1);
-    imshow("Second Image", src2);
-
-    int alpha_op = ALPHA_OVER;
-
-    const char* op_names[] = 
-    { 
-        "ALPHA_OVER", "ALPHA_IN", "ALPHA_OUT", "ALPHA_ATOP", "ALPHA_XOR", "ALPHA_PLUS", "ALPHA_OVER_PREMUL", "ALPHA_IN_PREMUL", "ALPHA_OUT_PREMUL",
-        "ALPHA_ATOP_PREMUL", "ALPHA_XOR_PREMUL", "ALPHA_PLUS_PREMUL", "ALPHA_PREMUL"
-    };
-
-    for(;;)
-    {
-        cout << op_names[alpha_op] << endl;
-
-        alphaComp(d_src1, d_src2, d_res, alpha_op);
-
-        imshow("Result", d_res);
-
-        char key = static_cast<char>(waitKey());
-
-        if (key == 27)
-            break;
-
-        if (key == 32)
-        {
-            ++alpha_op;
-
-            if (alpha_op > ALPHA_PREMUL)
-                alpha_op = ALPHA_OVER; 
-        }
-    }
-
-    return 0;
-}
+#include <iostream>
+
+#include "opencv2/core/opengl_interop.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/gpu/gpu.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+int main()
+{
+    cout << "This program demonstrates using alphaComp" << endl;
+    cout << "Press SPACE to change compositing operation" << endl;
+    cout << "Press ESC to exit" << endl;
+
+    namedWindow("First Image", WINDOW_NORMAL);
+    namedWindow("Second Image", WINDOW_NORMAL);
+    namedWindow("Result", WINDOW_OPENGL);
+
+    setGlDevice();
+
+    Mat src1(640, 480, CV_8UC4, Scalar::all(0));
+    Mat src2(640, 480, CV_8UC4, Scalar::all(0));
+
+    rectangle(src1, Rect(50, 50, 200, 200), Scalar(0, 0, 255, 128), 30);
+    rectangle(src2, Rect(100, 100, 200, 200), Scalar(255, 0, 0, 128), 30);
+
+    GpuMat d_src1(src1);
+    GpuMat d_src2(src2);
+
+    GpuMat d_res;
+
+    imshow("First Image", src1);
+    imshow("Second Image", src2);
+
+    int alpha_op = ALPHA_OVER;
+
+    const char* op_names[] =
+    {
+        "ALPHA_OVER", "ALPHA_IN", "ALPHA_OUT", "ALPHA_ATOP", "ALPHA_XOR", "ALPHA_PLUS", "ALPHA_OVER_PREMUL", "ALPHA_IN_PREMUL", "ALPHA_OUT_PREMUL",
+        "ALPHA_ATOP_PREMUL", "ALPHA_XOR_PREMUL", "ALPHA_PLUS_PREMUL", "ALPHA_PREMUL"
+    };
+
+    for(;;)
+    {
+        cout << op_names[alpha_op] << endl;
+
+        alphaComp(d_src1, d_src2, d_res, alpha_op);
+
+        imshow("Result", d_res);
+
+        char key = static_cast<char>(waitKey());
+
+        if (key == 27)
+            break;
+
+        if (key == 32)
+        {
+            ++alpha_op;
+
+            if (alpha_op > ALPHA_PREMUL)
+                alpha_op = ALPHA_OVER;
+        }
+    }
+
+    return 0;
+}
--- a/samples/gpu/brox_optical_flow.cpp
+++ b/samples/gpu/brox_optical_flow.cpp
@@ -1,312 +1,312 @@
-#include <iostream>
-#include <iomanip>
-#include <string>
-
-#include "cvconfig.h"
-#include "opencv2/core/core.hpp"
-#include "opencv2/core/opengl_interop.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/gpu/gpu.hpp"
-
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-
-void getFlowField(const Mat& u, const Mat& v, Mat& flowField);
-
-#ifdef HAVE_OPENGL
-
-void needleMapDraw(void* userdata);
-
-#endif
-
-int main(int argc, const char* argv[])
-{
-    try
-    {
-        const char* keys =
-           "{ h   help      |       | print help message }"
-           "{ l   left      |       | specify left image }"
-           "{ r   right     |       | specify right image }"
-           "{ s   scale     | 0.8   | set pyramid scale factor }"
-           "{ a   alpha     | 0.197 | set alpha }"
-           "{ g   gamma     | 50.0  | set gamma }"
-           "{ i   inner     | 10    | set number of inner iterations }"
-           "{ o   outer     | 77    | set number of outer iterations }"
-           "{ si  solver    | 10    | set number of basic solver iterations }"
-           "{ t   time_step | 0.1   | set frame interpolation time step }";
-
-        CommandLineParser cmd(argc, argv, keys);
-
-        if (cmd.has("help") || !cmd.check())
-        {
-            cmd.printMessage();
-            cmd.printErrors();
-            return 0;
-        }
-
-        string frame0Name = cmd.get<string>("left");
-        string frame1Name = cmd.get<string>("right");
-        float scale = cmd.get<float>("scale");
-        float alpha = cmd.get<float>("alpha");
-        float gamma = cmd.get<float>("gamma");
-        int inner_iterations = cmd.get<int>("inner");
-        int outer_iterations = cmd.get<int>("outer");
-        int solver_iterations = cmd.get<int>("solver");
-        float timeStep = cmd.get<float>("time_step");
-
-        if (frame0Name.empty() || frame1Name.empty())
-        {
-            cerr << "Missing input file names" << endl;
-            return -1;
-        }
-
-        Mat frame0Color = imread(frame0Name);
-        Mat frame1Color = imread(frame1Name);
-
-        if (frame0Color.empty() || frame1Color.empty())
-        {
-            cout << "Can't load input images" << endl;
-            return -1;
-        }
-
-        cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
-
-        cout << "OpenCV / NVIDIA Computer Vision" << endl;
-        cout << "Optical Flow Demo: Frame Interpolation" << endl;
-        cout << "=========================================" << endl;
-
-        namedWindow("Forward flow");
-        namedWindow("Backward flow");
-
-        namedWindow("Needle Map", WINDOW_OPENGL);
-
-        namedWindow("Interpolated frame");
-
-        setGlDevice();
-
-        cout << "Press:" << endl;
-        cout << "\tESC to quit" << endl;
-        cout << "\t'a' to move to the previous frame" << endl;
-        cout << "\t's' to move to the next frame\n" << endl;
-
-        frame0Color.convertTo(frame0Color, CV_32F, 1.0 / 255.0);
-        frame1Color.convertTo(frame1Color, CV_32F, 1.0 / 255.0);
-
-        Mat frame0Gray, frame1Gray;
-
-        cvtColor(frame0Color, frame0Gray, COLOR_BGR2GRAY);
-        cvtColor(frame1Color, frame1Gray, COLOR_BGR2GRAY);
-
-        GpuMat d_frame0(frame0Gray);
-        GpuMat d_frame1(frame1Gray);
-
-        cout << "Estimating optical flow" << endl;
-
-        BroxOpticalFlow d_flow(alpha, gamma, scale, inner_iterations, outer_iterations, solver_iterations);
-
-        cout << "\tForward..." << endl;
-
-        GpuMat d_fu, d_fv;
-
-        d_flow(d_frame0, d_frame1, d_fu, d_fv);
-
-        Mat flowFieldForward;
-        getFlowField(Mat(d_fu), Mat(d_fv), flowFieldForward);
-
-        cout << "\tBackward..." << endl;
-
-        GpuMat d_bu, d_bv;
-
-        d_flow(d_frame1, d_frame0, d_bu, d_bv);
-
-        Mat flowFieldBackward;
-        getFlowField(Mat(d_bu), Mat(d_bv), flowFieldBackward);
-
-#ifdef HAVE_OPENGL
-        cout << "Create Optical Flow Needle Map..." << endl;
-
-        GpuMat d_vertex, d_colors;
-
-        createOpticalFlowNeedleMap(d_fu, d_fv, d_vertex, d_colors);
-#endif
-
-        cout << "Interpolating..." << endl;
-
-        // first frame color components
-        GpuMat d_b, d_g, d_r;
-
-        // second frame color components
-        GpuMat d_bt, d_gt, d_rt;
-
-        // prepare color components on host and copy them to device memory
-        Mat channels[3];
-        cv::split(frame0Color, channels);
-
-        d_b.upload(channels[0]);
-        d_g.upload(channels[1]);
-        d_r.upload(channels[2]);
-
-        cv::split(frame1Color, channels);
-
-        d_bt.upload(channels[0]);
-        d_gt.upload(channels[1]);
-        d_rt.upload(channels[2]);
-
-        // temporary buffer
-        GpuMat d_buf;
-
-        // intermediate frame color components (GPU memory)
-        GpuMat d_rNew, d_gNew, d_bNew;
-
-        GpuMat d_newFrame;
-
-        vector<Mat> frames;
-        frames.reserve(static_cast<int>(1.0f / timeStep) + 2);
-
-        frames.push_back(frame0Color);
-
-        // compute interpolated frames
-        for (float timePos = timeStep; timePos < 1.0f; timePos += timeStep)
-        {
-            // interpolate blue channel
-            interpolateFrames(d_b, d_bt, d_fu, d_fv, d_bu, d_bv, timePos, d_bNew, d_buf);
-
-            // interpolate green channel
-            interpolateFrames(d_g, d_gt, d_fu, d_fv, d_bu, d_bv, timePos, d_gNew, d_buf);
-
-            // interpolate red channel
-            interpolateFrames(d_r, d_rt, d_fu, d_fv, d_bu, d_bv, timePos, d_rNew, d_buf);
-
-            GpuMat channels3[] = {d_bNew, d_gNew, d_rNew};
-            merge(channels3, 3, d_newFrame);
-
-            frames.push_back(Mat(d_newFrame));
-
-            cout << setprecision(4) << timePos * 100.0f << "%\r";
-        }
-
-        frames.push_back(frame1Color);
-
-        cout << setw(5) << "100%" << endl;
-
-        cout << "Done" << endl;
-
-        imshow("Forward flow", flowFieldForward);
-        imshow("Backward flow", flowFieldBackward);
-
-#ifdef HAVE_OPENGL
-        GlArrays arr;
-        arr.setVertexArray(d_vertex);
-        arr.setColorArray(d_colors, false);
-
-        setOpenGlDrawCallback("Needle Map", needleMapDraw, &arr);
-#endif
-
-        int currentFrame = 0;
-
-        imshow("Interpolated frame", frames[currentFrame]);
-
-        for(;;)
-        {
-            int key = toupper(waitKey(10) & 0xff);
-
-            switch (key)
-            {
-            case 27:
-                return 0;
-
-            case 'A':
-                if (currentFrame > 0)
-                    --currentFrame;
-
-                imshow("Interpolated frame", frames[currentFrame]);
-                break;
-
-            case 'S':
-                if (currentFrame < static_cast<int>(frames.size()) - 1)
-                    ++currentFrame;
-
-                imshow("Interpolated frame", frames[currentFrame]);
-                break;
-            }
-        }
-    }
-    catch (const exception& ex)
-    {
-        cerr << ex.what() << endl;
-        return -1;
-    }
-    catch (...)
-    {
-        cerr << "Unknow error" << endl;
-        return -1;
-    }
-}
-
-template <typename T> inline T clamp (T x, T a, T b)
-{
-    return ((x) > (a) ? ((x) < (b) ? (x) : (b)) : (a));
-}
-
-template <typename T> inline T mapValue(T x, T a, T b, T c, T d)
-{
-    x = clamp(x, a, b);
-    return c + (d - c) * (x - a) / (b - a);
-}
-
-void getFlowField(const Mat& u, const Mat& v, Mat& flowField)
-{
-    float maxDisplacement = 1.0f;
-
-    for (int i = 0; i < u.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-        for (int j = 0; j < u.cols; ++j)
-        {
-            float d = max(fabsf(ptr_u[j]), fabsf(ptr_v[j]));
-
-            if (d > maxDisplacement)
-                maxDisplacement = d;
-        }
-    }
-
-    flowField.create(u.size(), CV_8UC4);
-
-    for (int i = 0; i < flowField.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-
-        Vec4b* row = flowField.ptr<Vec4b>(i);
-
-        for (int j = 0; j < flowField.cols; ++j)
-        {
-            row[j][0] = 0;
-            row[j][1] = static_cast<unsigned char> (mapValue (-ptr_v[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][2] = static_cast<unsigned char> (mapValue ( ptr_u[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][3] = 255;
-        }
-    }
-}
-
-#ifdef HAVE_OPENGL
-
-void needleMapDraw(void* userdata)
-{
-    const GlArrays* arr = static_cast<const GlArrays*>(userdata);
-
-    GlCamera camera;
-    camera.setOrthoProjection(0.0, 1.0, 1.0, 0.0, 0.0, 1.0);
-    camera.lookAt(Point3d(0.0, 0.0, 1.0), Point3d(0.0, 0.0, 0.0), Point3d(0.0, 1.0, 0.0));
-
-    camera.setupProjectionMatrix();
-    camera.setupModelViewMatrix();
-
-    render(*arr, RenderMode::TRIANGLES);
-}
-
-#endif
+#include <iostream>
+#include <iomanip>
+#include <string>
+
+#include "cvconfig.h"
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/opengl_interop.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/gpu/gpu.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+void getFlowField(const Mat& u, const Mat& v, Mat& flowField);
+
+#ifdef HAVE_OPENGL
+
+void needleMapDraw(void* userdata);
+
+#endif
+
+int main(int argc, const char* argv[])
+{
+    try
+    {
+        const char* keys =
+           "{ h   help      |       | print help message }"
+           "{ l   left      |       | specify left image }"
+           "{ r   right     |       | specify right image }"
+           "{ s   scale     | 0.8   | set pyramid scale factor }"
+           "{ a   alpha     | 0.197 | set alpha }"
+           "{ g   gamma     | 50.0  | set gamma }"
+           "{ i   inner     | 10    | set number of inner iterations }"
+           "{ o   outer     | 77    | set number of outer iterations }"
+           "{ si  solver    | 10    | set number of basic solver iterations }"
+           "{ t   time_step | 0.1   | set frame interpolation time step }";
+
+        CommandLineParser cmd(argc, argv, keys);
+
+        if (cmd.has("help") || !cmd.check())
+        {
+            cmd.printMessage();
+            cmd.printErrors();
+            return 0;
+        }
+
+        string frame0Name = cmd.get<string>("left");
+        string frame1Name = cmd.get<string>("right");
+        float scale = cmd.get<float>("scale");
+        float alpha = cmd.get<float>("alpha");
+        float gamma = cmd.get<float>("gamma");
+        int inner_iterations = cmd.get<int>("inner");
+        int outer_iterations = cmd.get<int>("outer");
+        int solver_iterations = cmd.get<int>("solver");
+        float timeStep = cmd.get<float>("time_step");
+
+        if (frame0Name.empty() || frame1Name.empty())
+        {
+            cerr << "Missing input file names" << endl;
+            return -1;
+        }
+
+        Mat frame0Color = imread(frame0Name);
+        Mat frame1Color = imread(frame1Name);
+
+        if (frame0Color.empty() || frame1Color.empty())
+        {
+            cout << "Can't load input images" << endl;
+            return -1;
+        }
+
+        cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
+
+        cout << "OpenCV / NVIDIA Computer Vision" << endl;
+        cout << "Optical Flow Demo: Frame Interpolation" << endl;
+        cout << "=========================================" << endl;
+
+        namedWindow("Forward flow");
+        namedWindow("Backward flow");
+
+        namedWindow("Needle Map", WINDOW_OPENGL);
+
+        namedWindow("Interpolated frame");
+
+        setGlDevice();
+
+        cout << "Press:" << endl;
+        cout << "\tESC to quit" << endl;
+        cout << "\t'a' to move to the previous frame" << endl;
+        cout << "\t's' to move to the next frame\n" << endl;
+
+        frame0Color.convertTo(frame0Color, CV_32F, 1.0 / 255.0);
+        frame1Color.convertTo(frame1Color, CV_32F, 1.0 / 255.0);
+
+        Mat frame0Gray, frame1Gray;
+
+        cvtColor(frame0Color, frame0Gray, COLOR_BGR2GRAY);
+        cvtColor(frame1Color, frame1Gray, COLOR_BGR2GRAY);
+
+        GpuMat d_frame0(frame0Gray);
+        GpuMat d_frame1(frame1Gray);
+
+        cout << "Estimating optical flow" << endl;
+
+        BroxOpticalFlow d_flow(alpha, gamma, scale, inner_iterations, outer_iterations, solver_iterations);
+
+        cout << "\tForward..." << endl;
+
+        GpuMat d_fu, d_fv;
+
+        d_flow(d_frame0, d_frame1, d_fu, d_fv);
+
+        Mat flowFieldForward;
+        getFlowField(Mat(d_fu), Mat(d_fv), flowFieldForward);
+
+        cout << "\tBackward..." << endl;
+
+        GpuMat d_bu, d_bv;
+
+        d_flow(d_frame1, d_frame0, d_bu, d_bv);
+
+        Mat flowFieldBackward;
+        getFlowField(Mat(d_bu), Mat(d_bv), flowFieldBackward);
+
+#ifdef HAVE_OPENGL
+        cout << "Create Optical Flow Needle Map..." << endl;
+
+        GpuMat d_vertex, d_colors;
+
+        createOpticalFlowNeedleMap(d_fu, d_fv, d_vertex, d_colors);
+#endif
+
+        cout << "Interpolating..." << endl;
+
+        // first frame color components
+        GpuMat d_b, d_g, d_r;
+
+        // second frame color components
+        GpuMat d_bt, d_gt, d_rt;
+
+        // prepare color components on host and copy them to device memory
+        Mat channels[3];
+        cv::split(frame0Color, channels);
+
+        d_b.upload(channels[0]);
+        d_g.upload(channels[1]);
+        d_r.upload(channels[2]);
+
+        cv::split(frame1Color, channels);
+
+        d_bt.upload(channels[0]);
+        d_gt.upload(channels[1]);
+        d_rt.upload(channels[2]);
+
+        // temporary buffer
+        GpuMat d_buf;
+
+        // intermediate frame color components (GPU memory)
+        GpuMat d_rNew, d_gNew, d_bNew;
+
+        GpuMat d_newFrame;
+
+        vector<Mat> frames;
+        frames.reserve(static_cast<int>(1.0f / timeStep) + 2);
+
+        frames.push_back(frame0Color);
+
+        // compute interpolated frames
+        for (float timePos = timeStep; timePos < 1.0f; timePos += timeStep)
+        {
+            // interpolate blue channel
+            interpolateFrames(d_b, d_bt, d_fu, d_fv, d_bu, d_bv, timePos, d_bNew, d_buf);
+
+            // interpolate green channel
+            interpolateFrames(d_g, d_gt, d_fu, d_fv, d_bu, d_bv, timePos, d_gNew, d_buf);
+
+            // interpolate red channel
+            interpolateFrames(d_r, d_rt, d_fu, d_fv, d_bu, d_bv, timePos, d_rNew, d_buf);
+
+            GpuMat channels3[] = {d_bNew, d_gNew, d_rNew};
+            merge(channels3, 3, d_newFrame);
+
+            frames.push_back(Mat(d_newFrame));
+
+            cout << setprecision(4) << timePos * 100.0f << "%\r";
+        }
+
+        frames.push_back(frame1Color);
+
+        cout << setw(5) << "100%" << endl;
+
+        cout << "Done" << endl;
+
+        imshow("Forward flow", flowFieldForward);
+        imshow("Backward flow", flowFieldBackward);
+
+#ifdef HAVE_OPENGL
+        GlArrays arr;
+        arr.setVertexArray(d_vertex);
+        arr.setColorArray(d_colors, false);
+
+        setOpenGlDrawCallback("Needle Map", needleMapDraw, &arr);
+#endif
+
+        int currentFrame = 0;
+
+        imshow("Interpolated frame", frames[currentFrame]);
+
+        for(;;)
+        {
+            int key = toupper(waitKey(10) & 0xff);
+
+            switch (key)
+            {
+            case 27:
+                return 0;
+
+            case 'A':
+                if (currentFrame > 0)
+                    --currentFrame;
+
+                imshow("Interpolated frame", frames[currentFrame]);
+                break;
+
+            case 'S':
+                if (currentFrame < static_cast<int>(frames.size()) - 1)
+                    ++currentFrame;
+
+                imshow("Interpolated frame", frames[currentFrame]);
+                break;
+            }
+        }
+    }
+    catch (const exception& ex)
+    {
+        cerr << ex.what() << endl;
+        return -1;
+    }
+    catch (...)
+    {
+        cerr << "Unknow error" << endl;
+        return -1;
+    }
+}
+
+template <typename T> inline T clamp (T x, T a, T b)
+{
+    return ((x) > (a) ? ((x) < (b) ? (x) : (b)) : (a));
+}
+
+template <typename T> inline T mapValue(T x, T a, T b, T c, T d)
+{
+    x = clamp(x, a, b);
+    return c + (d - c) * (x - a) / (b - a);
+}
+
+void getFlowField(const Mat& u, const Mat& v, Mat& flowField)
+{
+    float maxDisplacement = 1.0f;
+
+    for (int i = 0; i < u.rows; ++i)
+    {
+        const float* ptr_u = u.ptr<float>(i);
+        const float* ptr_v = v.ptr<float>(i);
+
+        for (int j = 0; j < u.cols; ++j)
+        {
+            float d = max(fabsf(ptr_u[j]), fabsf(ptr_v[j]));
+
+            if (d > maxDisplacement)
+                maxDisplacement = d;
+        }
+    }
+
+    flowField.create(u.size(), CV_8UC4);
+
+    for (int i = 0; i < flowField.rows; ++i)
+    {
+        const float* ptr_u = u.ptr<float>(i);
+        const float* ptr_v = v.ptr<float>(i);
+
+
+        Vec4b* row = flowField.ptr<Vec4b>(i);
+
+        for (int j = 0; j < flowField.cols; ++j)
+        {
+            row[j][0] = 0;
+            row[j][1] = static_cast<unsigned char> (mapValue (-ptr_v[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
+            row[j][2] = static_cast<unsigned char> (mapValue ( ptr_u[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
+            row[j][3] = 255;
+        }
+    }
+}
+
+#ifdef HAVE_OPENGL
+
+void needleMapDraw(void* userdata)
+{
+    const GlArrays* arr = static_cast<const GlArrays*>(userdata);
+
+    GlCamera camera;
+    camera.setOrthoProjection(0.0, 1.0, 1.0, 0.0, 0.0, 1.0);
+    camera.lookAt(Point3d(0.0, 0.0, 1.0), Point3d(0.0, 0.0, 0.0), Point3d(0.0, 1.0, 0.0));
+
+    camera.setupProjectionMatrix();
+    camera.setupModelViewMatrix();
+
+    render(*arr, RenderMode::TRIANGLES);
+}
+
+#endif
--- a/samples/gpu/cascadeclassifier.cpp
+++ b/samples/gpu/cascadeclassifier.cpp
@@ -1,308 +1,308 @@
-// WARNING: this sample is under construction! Use it on your own risk.
-#if defined _MSC_VER && _MSC_VER >= 1400
-#pragma warning(disable : 4100)
-#endif
-
-
-#include <iostream>
-#include <iomanip>
-#include "opencv2/contrib/contrib.hpp"
-#include "opencv2/objdetect/objdetect.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/imgproc/imgproc.hpp"
-#include "opencv2/gpu/gpu.hpp"
-
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-
-
-static void help()
-{
-    cout << "Usage: ./cascadeclassifier_gpu \n\t--cascade <cascade_file>\n\t(<image>|--video <video>|--camera <camera_id>)\n"
-            "Using OpenCV version " << CV_VERSION << endl << endl;
-}
-
-
-template<class T>
-void convertAndResize(const T& src, T& gray, T& resized, double scale)
-{
-    if (src.channels() == 3)
-    {
-        cvtColor( src, gray, CV_BGR2GRAY );
-    }
-    else
-    {
-        gray = src;
-    }
-
-    Size sz(cvRound(gray.cols * scale), cvRound(gray.rows * scale));
-
-    if (scale != 1)
-    {
-        resize(gray, resized, sz);
-    }
-    else
-    {
-        resized = gray;
-    }
-}
-
-
-static void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
-{
-    int fontFace = FONT_HERSHEY_DUPLEX;
-    double fontScale = 0.8;
-    int fontThickness = 2;
-    Size fontSize = cv::getTextSize("T[]", fontFace, fontScale, fontThickness, 0);
-
-    Point org;
-    org.x = 1;
-    org.y = 3 * fontSize.height * (lineOffsY + 1) / 2;
-    putText(img, ss, org, fontFace, fontScale, CV_RGB(0,0,0), 5*fontThickness/2, 16);
-    putText(img, ss, org, fontFace, fontScale, fontColor, fontThickness, 16);
-}
-
-
-static void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bFilter, double fps)
-{
-    Scalar fontColorRed = CV_RGB(255,0,0);
-    Scalar fontColorNV  = CV_RGB(118,185,0);
-
-    ostringstream ss;
-    ss << "FPS = " << setprecision(1) << fixed << fps;
-    matPrint(canvas, 0, fontColorRed, ss.str());
-    ss.str("");
-    ss << "[" << canvas.cols << "x" << canvas.rows << "], " <<
-        (bGpu ? "GPU, " : "CPU, ") <<
-        (bLargestFace ? "OneFace, " : "MultiFace, ") <<
-        (bFilter ? "Filter:ON" : "Filter:OFF");
-    matPrint(canvas, 1, fontColorRed, ss.str());
-
-    // by Anatoly. MacOS fix. ostringstream(const string&) is a private
-    // matPrint(canvas, 2, fontColorNV, ostringstream("Space - switch GPU / CPU"));
-    if (bHelp)
-    {
-        matPrint(canvas, 2, fontColorNV, "Space - switch GPU / CPU");
-        matPrint(canvas, 3, fontColorNV, "M - switch OneFace / MultiFace");
-        matPrint(canvas, 4, fontColorNV, "F - toggle rectangles Filter");
-        matPrint(canvas, 5, fontColorNV, "H - toggle hotkeys help");
-        matPrint(canvas, 6, fontColorNV, "1/Q - increase/decrease scale");
-    }
-    else
-    {
-        matPrint(canvas, 2, fontColorNV, "H - toggle hotkeys help");
-    }
-}
-
-
-int main(int argc, const char *argv[])
-{
-    if (argc == 1)
-    {
-        help();
-        return -1;
-    }
-
-    if (getCudaEnabledDeviceCount() == 0)
-    {
-        return cerr << "No GPU found or the library is compiled without GPU support" << endl, -1;
-    }
-
-    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
-
-    string cascadeName;
-    string inputName;
-    bool isInputImage = false;
-    bool isInputVideo = false;
-    bool isInputCamera = false;
-
-    for (int i = 1; i < argc; ++i)
-    {
-        if (string(argv[i]) == "--cascade")
-            cascadeName = argv[++i];
-        else if (string(argv[i]) == "--video")
-        {
-            inputName = argv[++i];
-            isInputVideo = true;
-        }
-        else if (string(argv[i]) == "--camera")
-        {
-            inputName = argv[++i];
-            isInputCamera = true;
-        }
-        else if (string(argv[i]) == "--help")
-        {
-            help();
-            return -1;
-        }
-        else if (!isInputImage)
-        {
-            inputName = argv[i];
-            isInputImage = true;
-        }
-        else
-        {
-            cout << "Unknown key: " << argv[i] << endl;
-            return -1;
-        }
-    }
-
-    CascadeClassifier_GPU cascade_gpu;
-    if (!cascade_gpu.load(cascadeName))
-    {
-        return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;
-    }
-
-    CascadeClassifier cascade_cpu;
-    if (!cascade_cpu.load(cascadeName))
-    {
-        return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;
-    }
-
-    VideoCapture capture;
-    Mat image;
-
-    if (isInputImage)
-    {
-        image = imread(inputName);
-        CV_Assert(!image.empty());
-    }
-    else if (isInputVideo)
-    {
-        capture.open(inputName);
-        CV_Assert(capture.isOpened());
-    }
-    else
-    {
-        capture.open(atoi(inputName.c_str()));
-        CV_Assert(capture.isOpened());
-    }
-
-    namedWindow("result", 1);
-
-    Mat frame, frame_cpu, gray_cpu, resized_cpu, faces_downloaded, frameDisp;
-    vector<Rect> facesBuf_cpu;
-
-    GpuMat frame_gpu, gray_gpu, resized_gpu, facesBuf_gpu;
-
-    /* parameters */
-    bool useGPU = true;
-    double scaleFactor = 1.0;
-    bool findLargestObject = false;
-    bool filterRects = true;
-    bool helpScreen = false;
-
-    int detections_num;
-    for (;;)
-    {
-        if (isInputCamera || isInputVideo)
-        {
-            capture >> frame;
-            if (frame.empty())
-            {
-                break;
-            }
-        }
-
-        (image.empty() ? frame : image).copyTo(frame_cpu);
-        frame_gpu.upload(image.empty() ? frame : image);
-
-        convertAndResize(frame_gpu, gray_gpu, resized_gpu, scaleFactor);
-        convertAndResize(frame_cpu, gray_cpu, resized_cpu, scaleFactor);
-
-        TickMeter tm;
-        tm.start();
-
-        if (useGPU)
-        {
-            cascade_gpu.visualizeInPlace = true;
-            cascade_gpu.findLargestObject = findLargestObject;
-
-            detections_num = cascade_gpu.detectMultiScale(resized_gpu, facesBuf_gpu, 1.2,
-                                                          (filterRects || findLargestObject) ? 4 : 0);
-            facesBuf_gpu.colRange(0, detections_num).download(faces_downloaded);
-        }
-        else
-        {
-            Size minSize = cascade_gpu.getClassifierSize();
-            cascade_cpu.detectMultiScale(resized_cpu, facesBuf_cpu, 1.2,
-                                         (filterRects || findLargestObject) ? 4 : 0,
-                                         (findLargestObject ? CV_HAAR_FIND_BIGGEST_OBJECT : 0)
-                                            | CV_HAAR_SCALE_IMAGE,
-                                         minSize);
-            detections_num = (int)facesBuf_cpu.size();
-        }
-
-        if (!useGPU && detections_num)
-        {
-            for (int i = 0; i < detections_num; ++i)
-            {
-                rectangle(resized_cpu, facesBuf_cpu[i], Scalar(255));
-            }
-        }
-
-        if (useGPU)
-        {
-            resized_gpu.download(resized_cpu);
-        }
-
-        tm.stop();
-        double detectionTime = tm.getTimeMilli();
-        double fps = 1000 / detectionTime;
-
-        //print detections to console
-        cout << setfill(' ') << setprecision(2);
-        cout << setw(6) << fixed << fps << " FPS, " << detections_num << " det";
-        if ((filterRects || findLargestObject) && detections_num > 0)
-        {
-            Rect *faceRects = useGPU ? faces_downloaded.ptr<Rect>() : &facesBuf_cpu[0];
-            for (int i = 0; i < min(detections_num, 2); ++i)
-            {
-                cout << ", [" << setw(4) << faceRects[i].x
-                     << ", " << setw(4) << faceRects[i].y
-                     << ", " << setw(4) << faceRects[i].width
-                     << ", " << setw(4) << faceRects[i].height << "]";
-            }
-        }
-        cout << endl;
-
-        cvtColor(resized_cpu, frameDisp, CV_GRAY2BGR);
-        displayState(frameDisp, helpScreen, useGPU, findLargestObject, filterRects, fps);
-        imshow("result", frameDisp);
-
-        char key = (char)waitKey(5);
-        if (key == 27)
-        {
-            break;
-        }
-
-        switch (key)
-        {
-        case ' ':
-            useGPU = !useGPU;
-            break;
-        case 'm':
-        case 'M':
-            findLargestObject = !findLargestObject;
-            break;
-        case 'f':
-        case 'F':
-            filterRects = !filterRects;
-            break;
-        case '1':
-            scaleFactor *= 1.05;
-            break;
-        case 'q':
-        case 'Q':
-            scaleFactor /= 1.05;
-            break;
-        case 'h':
-        case 'H':
-            helpScreen = !helpScreen;
-            break;
-        }
-    }
-
-    return 0;
-}
+// WARNING: this sample is under construction! Use it on your own risk.
+#if defined _MSC_VER && _MSC_VER >= 1400
+#pragma warning(disable : 4100)
+#endif
+
+
+#include <iostream>
+#include <iomanip>
+#include "opencv2/contrib/contrib.hpp"
+#include "opencv2/objdetect/objdetect.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/gpu/gpu.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+
+static void help()
+{
+    cout << "Usage: ./cascadeclassifier_gpu \n\t--cascade <cascade_file>\n\t(<image>|--video <video>|--camera <camera_id>)\n"
+            "Using OpenCV version " << CV_VERSION << endl << endl;
+}
+
+
+template<class T>
+void convertAndResize(const T& src, T& gray, T& resized, double scale)
+{
+    if (src.channels() == 3)
+    {
+        cvtColor( src, gray, CV_BGR2GRAY );
+    }
+    else
+    {
+        gray = src;
+    }
+
+    Size sz(cvRound(gray.cols * scale), cvRound(gray.rows * scale));
+
+    if (scale != 1)
+    {
+        resize(gray, resized, sz);
+    }
+    else
+    {
+        resized = gray;
+    }
+}
+
+
+static void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
+{
+    int fontFace = FONT_HERSHEY_DUPLEX;
+    double fontScale = 0.8;
+    int fontThickness = 2;
+    Size fontSize = cv::getTextSize("T[]", fontFace, fontScale, fontThickness, 0);
+
+    Point org;
+    org.x = 1;
+    org.y = 3 * fontSize.height * (lineOffsY + 1) / 2;
+    putText(img, ss, org, fontFace, fontScale, CV_RGB(0,0,0), 5*fontThickness/2, 16);
+    putText(img, ss, org, fontFace, fontScale, fontColor, fontThickness, 16);
+}
+
+
+static void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bFilter, double fps)
+{
+    Scalar fontColorRed = CV_RGB(255,0,0);
+    Scalar fontColorNV  = CV_RGB(118,185,0);
+
+    ostringstream ss;
+    ss << "FPS = " << setprecision(1) << fixed << fps;
+    matPrint(canvas, 0, fontColorRed, ss.str());
+    ss.str("");
+    ss << "[" << canvas.cols << "x" << canvas.rows << "], " <<
+        (bGpu ? "GPU, " : "CPU, ") <<
+        (bLargestFace ? "OneFace, " : "MultiFace, ") <<
+        (bFilter ? "Filter:ON" : "Filter:OFF");
+    matPrint(canvas, 1, fontColorRed, ss.str());
+
+    // by Anatoly. MacOS fix. ostringstream(const string&) is a private
+    // matPrint(canvas, 2, fontColorNV, ostringstream("Space - switch GPU / CPU"));
+    if (bHelp)
+    {
+        matPrint(canvas, 2, fontColorNV, "Space - switch GPU / CPU");
+        matPrint(canvas, 3, fontColorNV, "M - switch OneFace / MultiFace");
+        matPrint(canvas, 4, fontColorNV, "F - toggle rectangles Filter");
+        matPrint(canvas, 5, fontColorNV, "H - toggle hotkeys help");
+        matPrint(canvas, 6, fontColorNV, "1/Q - increase/decrease scale");
+    }
+    else
+    {
+        matPrint(canvas, 2, fontColorNV, "H - toggle hotkeys help");
+    }
+}
+
+
+int main(int argc, const char *argv[])
+{
+    if (argc == 1)
+    {
+        help();
+        return -1;
+    }
+
+    if (getCudaEnabledDeviceCount() == 0)
+    {
+        return cerr << "No GPU found or the library is compiled without GPU support" << endl, -1;
+    }
+
+    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
+
+    string cascadeName;
+    string inputName;
+    bool isInputImage = false;
+    bool isInputVideo = false;
+    bool isInputCamera = false;
+
+    for (int i = 1; i < argc; ++i)
+    {
+        if (string(argv[i]) == "--cascade")
+            cascadeName = argv[++i];
+        else if (string(argv[i]) == "--video")
+        {
+            inputName = argv[++i];
+            isInputVideo = true;
+        }
+        else if (string(argv[i]) == "--camera")
+        {
+            inputName = argv[++i];
+            isInputCamera = true;
+        }
+        else if (string(argv[i]) == "--help")
+        {
+            help();
+            return -1;
+        }
+        else if (!isInputImage)
+        {
+            inputName = argv[i];
+            isInputImage = true;
+        }
+        else
+        {
+            cout << "Unknown key: " << argv[i] << endl;
+            return -1;
+        }
+    }
+
+    CascadeClassifier_GPU cascade_gpu;
+    if (!cascade_gpu.load(cascadeName))
+    {
+        return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;
+    }
+
+    CascadeClassifier cascade_cpu;
+    if (!cascade_cpu.load(cascadeName))
+    {
+        return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;
+    }
+
+    VideoCapture capture;
+    Mat image;
+
+    if (isInputImage)
+    {
+        image = imread(inputName);
+        CV_Assert(!image.empty());
+    }
+    else if (isInputVideo)
+    {
+        capture.open(inputName);
+        CV_Assert(capture.isOpened());
+    }
+    else
+    {
+        capture.open(atoi(inputName.c_str()));
+        CV_Assert(capture.isOpened());
+    }
+
+    namedWindow("result", 1);
+
+    Mat frame, frame_cpu, gray_cpu, resized_cpu, faces_downloaded, frameDisp;
+    vector<Rect> facesBuf_cpu;
+
+    GpuMat frame_gpu, gray_gpu, resized_gpu, facesBuf_gpu;
+
+    /* parameters */
+    bool useGPU = true;
+    double scaleFactor = 1.0;
+    bool findLargestObject = false;
+    bool filterRects = true;
+    bool helpScreen = false;
+
+    int detections_num;
+    for (;;)
+    {
+        if (isInputCamera || isInputVideo)
+        {
+            capture >> frame;
+            if (frame.empty())
+            {
+                break;
+            }
+        }
+
+        (image.empty() ? frame : image).copyTo(frame_cpu);
+        frame_gpu.upload(image.empty() ? frame : image);
+
+        convertAndResize(frame_gpu, gray_gpu, resized_gpu, scaleFactor);
+        convertAndResize(frame_cpu, gray_cpu, resized_cpu, scaleFactor);
+
+        TickMeter tm;
+        tm.start();
+
+        if (useGPU)
+        {
+            cascade_gpu.visualizeInPlace = true;
+            cascade_gpu.findLargestObject = findLargestObject;
+
+            detections_num = cascade_gpu.detectMultiScale(resized_gpu, facesBuf_gpu, 1.2,
+                                                          (filterRects || findLargestObject) ? 4 : 0);
+            facesBuf_gpu.colRange(0, detections_num).download(faces_downloaded);
+        }
+        else
+        {
+            Size minSize = cascade_gpu.getClassifierSize();
+            cascade_cpu.detectMultiScale(resized_cpu, facesBuf_cpu, 1.2,
+                                         (filterRects || findLargestObject) ? 4 : 0,
+                                         (findLargestObject ? CV_HAAR_FIND_BIGGEST_OBJECT : 0)
+                                            | CV_HAAR_SCALE_IMAGE,
+                                         minSize);
+            detections_num = (int)facesBuf_cpu.size();
+        }
+
+        if (!useGPU && detections_num)
+        {
+            for (int i = 0; i < detections_num; ++i)
+            {
+                rectangle(resized_cpu, facesBuf_cpu[i], Scalar(255));
+            }
+        }
+
+        if (useGPU)
+        {
+            resized_gpu.download(resized_cpu);
+        }
+
+        tm.stop();
+        double detectionTime = tm.getTimeMilli();
+        double fps = 1000 / detectionTime;
+
+        //print detections to console
+        cout << setfill(' ') << setprecision(2);
+        cout << setw(6) << fixed << fps << " FPS, " << detections_num << " det";
+        if ((filterRects || findLargestObject) && detections_num > 0)
+        {
+            Rect *faceRects = useGPU ? faces_downloaded.ptr<Rect>() : &facesBuf_cpu[0];
+            for (int i = 0; i < min(detections_num, 2); ++i)
+            {
+                cout << ", [" << setw(4) << faceRects[i].x
+                     << ", " << setw(4) << faceRects[i].y
+                     << ", " << setw(4) << faceRects[i].width
+                     << ", " << setw(4) << faceRects[i].height << "]";
+            }
+        }
+        cout << endl;
+
+        cvtColor(resized_cpu, frameDisp, CV_GRAY2BGR);
+        displayState(frameDisp, helpScreen, useGPU, findLargestObject, filterRects, fps);
+        imshow("result", frameDisp);
+
+        char key = (char)waitKey(5);
+        if (key == 27)
+        {
+            break;
+        }
+
+        switch (key)
+        {
+        case ' ':
+            useGPU = !useGPU;
+            break;
+        case 'm':
+        case 'M':
+            findLargestObject = !findLargestObject;
+            break;
+        case 'f':
+        case 'F':
+            filterRects = !filterRects;
+            break;
+        case '1':
+            scaleFactor *= 1.05;
+            break;
+        case 'q':
+        case 'Q':
+            scaleFactor /= 1.05;
+            break;
+        case 'h':
+        case 'H':
+            helpScreen = !helpScreen;
+            break;
+        }
+    }
+
+    return 0;
+}
--- a/samples/gpu/cascadeclassifier_nvidia_api.cpp
+++ b/samples/gpu/cascadeclassifier_nvidia_api.cpp
@@ -1,380 +1,380 @@
-#if defined _MSC_VER && _MSC_VER >= 1400
-#pragma warning( disable : 4201 4408 4127 4100)
-#endif
-
-#include "cvconfig.h"
-#include <iostream>
-#include <iomanip>
-#include <cstdio>
-#include "opencv2/gpu/gpu.hpp"
-#include "opencv2/highgui/highgui.hpp"
-
-#ifdef HAVE_CUDA
-#include "NCVHaarObjectDetection.hpp"
-#endif
-
-using namespace std;
-using namespace cv;
-
-
-#if !defined(HAVE_CUDA)
-int main( int, const char** )
-{
-    cout << "Please compile the library with CUDA support" << endl;
-    return -1;
-}
-#else
-
-
-const Size2i preferredVideoFrameSize(640, 480);
-const string wndTitle = "NVIDIA Computer Vision :: Haar Classifiers Cascade";
-
-
-void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
-{
-    int fontFace = FONT_HERSHEY_DUPLEX;
-    double fontScale = 0.8;
-    int fontThickness = 2;
-    Size fontSize = cv::getTextSize("T[]", fontFace, fontScale, fontThickness, 0);
-
-    Point org;
-    org.x = 1;
-    org.y = 3 * fontSize.height * (lineOffsY + 1) / 2;
-    putText(img, ss, org, fontFace, fontScale, CV_RGB(0,0,0), 5*fontThickness/2, 16);
-    putText(img, ss, org, fontFace, fontScale, fontColor, fontThickness, 16);
-}
-
-
-void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bFilter, double fps)
-{
-    Scalar fontColorRed = CV_RGB(255,0,0);
-    Scalar fontColorNV  = CV_RGB(118,185,0);
-
-    ostringstream ss;
-    ss << "FPS = " << setprecision(1) << fixed << fps;
-    matPrint(canvas, 0, fontColorRed, ss.str());
-    ss.str("");
-    ss << "[" << canvas.cols << "x" << canvas.rows << "], " <<
-        (bGpu ? "GPU, " : "CPU, ") <<
-        (bLargestFace ? "OneFace, " : "MultiFace, ") <<
-        (bFilter ? "Filter:ON" : "Filter:OFF");
-    matPrint(canvas, 1, fontColorRed, ss.str());
-
-    if (bHelp)
-    {
-        matPrint(canvas, 2, fontColorNV, "Space - switch GPU / CPU");
-        matPrint(canvas, 3, fontColorNV, "M - switch OneFace / MultiFace");
-        matPrint(canvas, 4, fontColorNV, "F - toggle rectangles Filter");
-        matPrint(canvas, 5, fontColorNV, "H - toggle hotkeys help");
-    }
-    else
-    {
-        matPrint(canvas, 2, fontColorNV, "H - toggle hotkeys help");
-    }
-}
-
-
-NCVStatus process(Mat *srcdst,
-                  Ncv32u width, Ncv32u height,
-                  NcvBool bFilterRects, NcvBool bLargestFace,
-                  HaarClassifierCascadeDescriptor &haar,
-                  NCVVector<HaarStage64> &d_haarStages, NCVVector<HaarClassifierNode128> &d_haarNodes,
-                  NCVVector<HaarFeature64> &d_haarFeatures, NCVVector<HaarStage64> &h_haarStages,
-                  INCVMemAllocator &gpuAllocator,
-                  INCVMemAllocator &cpuAllocator,
-                  cudaDeviceProp &devProp)
-{
-    ncvAssertReturn(!((srcdst == NULL) ^ gpuAllocator.isCounting()), NCV_NULL_PTR);
-
-    NCVStatus ncvStat;
-
-    NCV_SET_SKIP_COND(gpuAllocator.isCounting());
-
-    NCVMatrixAlloc<Ncv8u> d_src(gpuAllocator, width, height);
-    ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
-    NCVMatrixAlloc<Ncv8u> h_src(cpuAllocator, width, height);
-    ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
-    NCVVectorAlloc<NcvRect32u> d_rects(gpuAllocator, 100);
-    ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
-
-    NCV_SKIP_COND_BEGIN
-
-    for (Ncv32u i=0; i<(Ncv32u)srcdst->rows; i++)
-    {
-        memcpy(h_src.ptr() + i * h_src.stride(), srcdst->ptr(i), srcdst->cols);
-    }
-
-    ncvStat = h_src.copySolid(d_src, 0);
-    ncvAssertReturnNcvStat(ncvStat);
-    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
-
-    NCV_SKIP_COND_END
-
-    NcvSize32u roi;
-    roi.width = d_src.width();
-    roi.height = d_src.height();
-
-    Ncv32u numDetections;
-    ncvStat = ncvDetectObjectsMultiScale_device(
-        d_src, roi, d_rects, numDetections, haar, h_haarStages,
-        d_haarStages, d_haarNodes, d_haarFeatures,
-        haar.ClassifierSize,
-        (bFilterRects || bLargestFace) ? 4 : 0,
-        1.2f, 1,
-        (bLargestFace ? NCVPipeObjDet_FindLargestObject : 0)
-        | NCVPipeObjDet_VisualizeInPlace,
-        gpuAllocator, cpuAllocator, devProp, 0);
-    ncvAssertReturnNcvStat(ncvStat);
-    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
-
-    NCV_SKIP_COND_BEGIN
-
-    ncvStat = d_src.copySolid(h_src, 0);
-    ncvAssertReturnNcvStat(ncvStat);
-    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
-
-    for (Ncv32u i=0; i<(Ncv32u)srcdst->rows; i++)
-    {
-        memcpy(srcdst->ptr(i), h_src.ptr() + i * h_src.stride(), srcdst->cols);
-    }
-
-    NCV_SKIP_COND_END
-
-    return NCV_SUCCESS;
-}
-
-
-int main(int argc, const char** argv)
-{
-    cout << "OpenCV / NVIDIA Computer Vision" << endl;
-    cout << "Face Detection in video and live feed" << endl;
-    cout << "Syntax: exename <cascade_file> <image_or_video_or_cameraid>" << endl;
-    cout << "=========================================" << endl;
-
-    ncvAssertPrintReturn(cv::gpu::getCudaEnabledDeviceCount() != 0, "No GPU found or the library is compiled without GPU support", -1);
-    ncvAssertPrintReturn(argc == 3, "Invalid number of arguments", -1);
-
-    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
-
-    string cascadeName = argv[1];
-    string inputName = argv[2];
-
-    NCVStatus ncvStat;
-    NcvBool bQuit = false;
-    VideoCapture capture;
-    Size2i frameSize;
-
-    //open content source
-    Mat image = imread(inputName);
-    Mat frame;
-    if (!image.empty())
-    {
-        frameSize.width = image.cols;
-        frameSize.height = image.rows;
-    }
-    else
-    {
-        if (!capture.open(inputName))
-        {
-            int camid = -1;
-
-            istringstream ss(inputName);
-            int x = 0;
-            ss >> x;
-
-            ncvAssertPrintReturn(capture.open(camid) != 0, "Can't open source", -1);
-        }
-
-        capture >> frame;
-        ncvAssertPrintReturn(!frame.empty(), "Empty video source", -1);
-
-        frameSize.width = frame.cols;
-        frameSize.height = frame.rows;
-    }
-
-    NcvBool bUseGPU = true;
-    NcvBool bLargestObject = false;
-    NcvBool bFilterRects = true;
-    NcvBool bHelpScreen = false;
-
-    CascadeClassifier classifierOpenCV;
-    ncvAssertPrintReturn(classifierOpenCV.load(cascadeName) != 0, "Error (in OpenCV) opening classifier", -1);
-
-    int devId;
-    ncvAssertCUDAReturn(cudaGetDevice(&devId), -1);
-    cudaDeviceProp devProp;
-    ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), -1);
-    cout << "Using GPU: " << devId << "(" << devProp.name <<
-            "), arch=" << devProp.major << "." << devProp.minor << endl;
-
-    //==============================================================================
-    //
-    // Load the classifier from file (assuming its size is about 1 mb)
-    // using a simple allocator
-    //
-    //==============================================================================
-
-    NCVMemNativeAllocator gpuCascadeAllocator(NCVMemoryTypeDevice, static_cast<Ncv32u>(devProp.textureAlignment));
-    ncvAssertPrintReturn(gpuCascadeAllocator.isInitialized(), "Error creating cascade GPU allocator", -1);
-    NCVMemNativeAllocator cpuCascadeAllocator(NCVMemoryTypeHostPinned, static_cast<Ncv32u>(devProp.textureAlignment));
-    ncvAssertPrintReturn(cpuCascadeAllocator.isInitialized(), "Error creating cascade CPU allocator", -1);
-
-    Ncv32u haarNumStages, haarNumNodes, haarNumFeatures;
-    ncvStat = ncvHaarGetClassifierSize(cascadeName, haarNumStages, haarNumNodes, haarNumFeatures);
-    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", -1);
-
-    NCVVectorAlloc<HaarStage64> h_haarStages(cpuCascadeAllocator, haarNumStages);
-    ncvAssertPrintReturn(h_haarStages.isMemAllocated(), "Error in cascade CPU allocator", -1);
-    NCVVectorAlloc<HaarClassifierNode128> h_haarNodes(cpuCascadeAllocator, haarNumNodes);
-    ncvAssertPrintReturn(h_haarNodes.isMemAllocated(), "Error in cascade CPU allocator", -1);
-    NCVVectorAlloc<HaarFeature64> h_haarFeatures(cpuCascadeAllocator, haarNumFeatures);
-
-    ncvAssertPrintReturn(h_haarFeatures.isMemAllocated(), "Error in cascade CPU allocator", -1);
-
-    HaarClassifierCascadeDescriptor haar;
-    ncvStat = ncvHaarLoadFromFile_host(cascadeName, haar, h_haarStages, h_haarNodes, h_haarFeatures);
-    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", -1);
-
-    NCVVectorAlloc<HaarStage64> d_haarStages(gpuCascadeAllocator, haarNumStages);
-    ncvAssertPrintReturn(d_haarStages.isMemAllocated(), "Error in cascade GPU allocator", -1);
-    NCVVectorAlloc<HaarClassifierNode128> d_haarNodes(gpuCascadeAllocator, haarNumNodes);
-    ncvAssertPrintReturn(d_haarNodes.isMemAllocated(), "Error in cascade GPU allocator", -1);
-    NCVVectorAlloc<HaarFeature64> d_haarFeatures(gpuCascadeAllocator, haarNumFeatures);
-    ncvAssertPrintReturn(d_haarFeatures.isMemAllocated(), "Error in cascade GPU allocator", -1);
-
-    ncvStat = h_haarStages.copySolid(d_haarStages, 0);
-    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);
-    ncvStat = h_haarNodes.copySolid(d_haarNodes, 0);
-    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);
-    ncvStat = h_haarFeatures.copySolid(d_haarFeatures, 0);
-    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);
-
-    //==============================================================================
-    //
-    // Calculate memory requirements and create real allocators
-    //
-    //==============================================================================
-
-    NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
-    ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", -1);
-    NCVMemStackAllocator cpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
-    ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", -1);
-
-    ncvStat = process(NULL, frameSize.width, frameSize.height,
-                      false, false, haar,
-                      d_haarStages, d_haarNodes,
-                      d_haarFeatures, h_haarStages,
-                      gpuCounter, cpuCounter, devProp);
-    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in memory counting pass", -1);
-
-    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), static_cast<Ncv32u>(devProp.textureAlignment));
-    ncvAssertPrintReturn(gpuAllocator.isInitialized(), "Error creating GPU memory allocator", -1);
-    NCVMemStackAllocator cpuAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), static_cast<Ncv32u>(devProp.textureAlignment));
-    ncvAssertPrintReturn(cpuAllocator.isInitialized(), "Error creating CPU memory allocator", -1);
-
-    printf("Initialized for frame size [%dx%d]\n", frameSize.width, frameSize.height);
-
-    //==============================================================================
-    //
-    // Main processing loop
-    //
-    //==============================================================================
-
-    namedWindow(wndTitle, 1);
-    Mat gray, frameDisp;
-
-    do
-    {
-        Mat gray;
-        cvtColor((image.empty() ? frame : image), gray, CV_BGR2GRAY);
-
-        //
-        // process
-        //
-
-        NcvSize32u minSize = haar.ClassifierSize;
-        if (bLargestObject)
-        {
-            Ncv32u ratioX = preferredVideoFrameSize.width / minSize.width;
-            Ncv32u ratioY = preferredVideoFrameSize.height / minSize.height;
-            Ncv32u ratioSmallest = min(ratioX, ratioY);
-            ratioSmallest = max((Ncv32u)(ratioSmallest / 2.5f), (Ncv32u)1);
-            minSize.width *= ratioSmallest;
-            minSize.height *= ratioSmallest;
-        }
-
-        Ncv32f avgTime;
-        NcvTimer timer = ncvStartTimer();
-
-        if (bUseGPU)
-        {
-            ncvStat = process(&gray, frameSize.width, frameSize.height,
-                              bFilterRects, bLargestObject, haar,
-                              d_haarStages, d_haarNodes,
-                              d_haarFeatures, h_haarStages,
-                              gpuAllocator, cpuAllocator, devProp);
-            ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in memory counting pass", -1);
-        }
-        else
-        {
-            vector<Rect> rectsOpenCV;
-
-            classifierOpenCV.detectMultiScale(
-                gray,
-                rectsOpenCV,
-                1.2f,
-                bFilterRects ? 4 : 0,
-                (bLargestObject ? CV_HAAR_FIND_BIGGEST_OBJECT : 0)
-                | CV_HAAR_SCALE_IMAGE,
-                Size(minSize.width, minSize.height));
-
-            for (size_t rt = 0; rt < rectsOpenCV.size(); ++rt)
-                rectangle(gray, rectsOpenCV[rt], Scalar(255));
-        }
-
-        avgTime = (Ncv32f)ncvEndQueryTimerMs(timer);
-
-        cvtColor(gray, frameDisp, CV_GRAY2BGR);
-        displayState(frameDisp, bHelpScreen, bUseGPU, bLargestObject, bFilterRects, 1000.0f / avgTime);
-        imshow(wndTitle, frameDisp);
-
-        //handle input
-        switch (cvWaitKey(3))
-        {
-        case ' ':
-            bUseGPU = !bUseGPU;
-            break;
-        case 'm':
-        case 'M':
-            bLargestObject = !bLargestObject;
-            break;
-        case 'f':
-        case 'F':
-            bFilterRects = !bFilterRects;
-            break;
-        case 'h':
-        case 'H':
-            bHelpScreen = !bHelpScreen;
-            break;
-        case 27:
-            bQuit = true;
-            break;
-        }
-
-        // For camera and video file, capture the next image
-        if (capture.isOpened())
-        {
-            capture >> frame;
-            if (frame.empty())
-            {
-                break;
-            }
-        }
-    } while (!bQuit);
-
-    cvDestroyWindow(wndTitle.c_str());
-
-    return 0;
-}
-
-#endif //!defined(HAVE_CUDA)
+#if defined _MSC_VER && _MSC_VER >= 1400
+#pragma warning( disable : 4201 4408 4127 4100)
+#endif
+
+#include "cvconfig.h"
+#include <iostream>
+#include <iomanip>
+#include <cstdio>
+#include "opencv2/gpu/gpu.hpp"
+#include "opencv2/highgui/highgui.hpp"
+
+#ifdef HAVE_CUDA
+#include "NCVHaarObjectDetection.hpp"
+#endif
+
+using namespace std;
+using namespace cv;
+
+
+#if !defined(HAVE_CUDA)
+int main( int, const char** )
+{
+    cout << "Please compile the library with CUDA support" << endl; // stub used when HAVE_CUDA is not defined
+    return -1; // non-zero exit code: nothing was run
+}
+#else
+
+
+const Size2i preferredVideoFrameSize(640, 480);
+const string wndTitle = "NVIDIA Computer Vision :: Haar Classifiers Cascade";
+
+
+void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
+{
+    // Draws the string ss on img as text row number lineOffsY, in fontColor.
+    const int face = FONT_HERSHEY_DUPLEX;
+    const double scale = 0.8;
+    const int thickness = 2;
+
+    // Rows are spaced 1.5x the height of a reference string in this font.
+    const Size refSize = cv::getTextSize("T[]", face, scale, thickness, 0);
+    const Point origin(1, 3 * refSize.height * (lineOffsY + 1) / 2);
+    putText(img, ss, origin, face, scale, CV_RGB(0,0,0), 5*thickness/2, 16); // thicker black pass first, as an outline
+    putText(img, ss, origin, face, scale, fontColor, thickness, 16);
+}
+
+
+void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bFilter, double fps)
+{
+    // Overlays the FPS row, the current-mode row and the hotkey help on canvas.
+    const Scalar statusColor = CV_RGB(255,0,0);
+    const Scalar helpColor   = CV_RGB(118,185,0);
+
+    ostringstream fpsLine;
+    fpsLine << "FPS = " << setprecision(1) << fixed << fps;
+    matPrint(canvas, 0, statusColor, fpsLine.str());
+
+    ostringstream modeLine;
+    modeLine << "[" << canvas.cols << "x" << canvas.rows << "], ";
+    modeLine << (bGpu ? "GPU, " : "CPU, ");
+    modeLine << (bLargestFace ? "OneFace, " : "MultiFace, ");
+    modeLine << (bFilter ? "Filter:ON" : "Filter:OFF");
+    matPrint(canvas, 1, statusColor, modeLine.str());
+
+    if (!bHelp)
+    {
+        matPrint(canvas, 2, helpColor, "H - toggle hotkeys help"); // help hidden: only the hint row
+        return;
+    }
+    matPrint(canvas, 2, helpColor, "Space - switch GPU / CPU");
+    matPrint(canvas, 3, helpColor, "M - switch OneFace / MultiFace");
+    matPrint(canvas, 4, helpColor, "F - toggle rectangles Filter");
+    matPrint(canvas, 5, helpColor, "H - toggle hotkeys help");
+}
+
+// Runs one Haar cascade detection pass; srcdst must be NULL exactly when gpuAllocator is a counting allocator.
+NCVStatus process(Mat *srcdst,
+                  Ncv32u width, Ncv32u height,
+                  NcvBool bFilterRects, NcvBool bLargestFace,
+                  HaarClassifierCascadeDescriptor &haar,
+                  NCVVector<HaarStage64> &d_haarStages, NCVVector<HaarClassifierNode128> &d_haarNodes,
+                  NCVVector<HaarFeature64> &d_haarFeatures, NCVVector<HaarStage64> &h_haarStages,
+                  INCVMemAllocator &gpuAllocator,
+                  INCVMemAllocator &cpuAllocator,
+                  cudaDeviceProp &devProp)
+{
+    ncvAssertReturn(!((srcdst == NULL) ^ gpuAllocator.isCounting()), NCV_NULL_PTR);
+    // (the XOR above rejects both "image + counting allocator" and "no image + real allocator")
+    NCVStatus ncvStat;
+
+    NCV_SET_SKIP_COND(gpuAllocator.isCounting());
+    // NOTE(review): presumably turns the NCV_SKIP_COND_BEGIN/END sections into no-ops while counting - confirm in NCV.hpp
+    NCVMatrixAlloc<Ncv8u> d_src(gpuAllocator, width, height);
+    ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
+    NCVMatrixAlloc<Ncv8u> h_src(cpuAllocator, width, height);
+    ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
+    NCVVectorAlloc<NcvRect32u> d_rects(gpuAllocator, 100); // fixed capacity of 100 rectangles
+    ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
+
+    NCV_SKIP_COND_BEGIN
+    // Copy srcdst into h_src row by row, srcdst->cols bytes per row (assumes an 8-bit single-channel image no wider than width - TODO confirm)
+    for (Ncv32u i=0; i<(Ncv32u)srcdst->rows; i++)
+    {
+        memcpy(h_src.ptr() + i * h_src.stride(), srcdst->ptr(i), srcdst->cols);
+    }
+    // Host staging matrix -> device matrix (direction inferred from the data flow; the 0 is presumably the default stream)
+    ncvStat = h_src.copySolid(d_src, 0);
+    ncvAssertReturnNcvStat(ncvStat);
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
+
+    NCV_SKIP_COND_END
+
+    NcvSize32u roi;
+    roi.width = d_src.width();
+    roi.height = d_src.height();
+
+    Ncv32u numDetections;
+    ncvStat = ncvDetectObjectsMultiScale_device(
+        d_src, roi, d_rects, numDetections, haar, h_haarStages,
+        d_haarStages, d_haarNodes, d_haarFeatures,
+        haar.ClassifierSize,
+        (bFilterRects || bLargestFace) ? 4 : 0,
+        1.2f, 1,
+        (bLargestFace ? NCVPipeObjDet_FindLargestObject : 0)
+        | NCVPipeObjDet_VisualizeInPlace, // NOTE(review): flag name suggests detections are drawn into d_src - confirm
+        gpuAllocator, cpuAllocator, devProp, 0);
+    ncvAssertReturnNcvStat(ncvStat);
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
+
+    NCV_SKIP_COND_BEGIN
+    // Device matrix -> host staging matrix, then back into the caller's image row by row
+    ncvStat = d_src.copySolid(h_src, 0);
+    ncvAssertReturnNcvStat(ncvStat);
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
+
+    for (Ncv32u i=0; i<(Ncv32u)srcdst->rows; i++)
+    {
+        memcpy(srcdst->ptr(i), h_src.ptr() + i * h_src.stride(), srcdst->cols);
+    }
+
+    NCV_SKIP_COND_END
+
+    return NCV_SUCCESS;
+}
+
+// Entry point: loads the cascade named by argv[1] and runs face detection on the image, video file or camera id given as argv[2]; returns 0 on normal exit, -1 on any setup or processing error.
+int main(int argc, const char** argv)
+{
+    cout << "OpenCV / NVIDIA Computer Vision" << endl;
+    cout << "Face Detection in video and live feed" << endl;
+    cout << "Syntax: exename <cascade_file> <image_or_video_or_cameraid>" << endl;
+    cout << "=========================================" << endl;
+
+    ncvAssertPrintReturn(cv::gpu::getCudaEnabledDeviceCount() != 0, "No GPU found or the library is compiled without GPU support", -1);
+    ncvAssertPrintReturn(argc == 3, "Invalid number of arguments", -1);
+
+    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
+
+    string cascadeName = argv[1];
+    string inputName = argv[2];
+
+    NCVStatus ncvStat;
+    NcvBool bQuit = false;
+    VideoCapture capture;
+    Size2i frameSize;
+
+    //open content source
+    Mat image = imread(inputName);
+    Mat frame;
+    if (!image.empty())
+    {
+        frameSize.width = image.cols;
+        frameSize.height = image.rows;
+    }
+    else
+    {
+        if (!capture.open(inputName))
+        {
+            // Not an openable file: interpret the argument as a numeric camera id.
+            int camid = -1;
+            istringstream ss(inputName);
+            if (!(ss >> camid))
+                camid = -1; // not a number: keep the former behaviour of opening camera -1
+
+            ncvAssertPrintReturn(capture.open(camid) != 0, "Can't open source", -1);
+        }
+
+        capture >> frame;
+        ncvAssertPrintReturn(!frame.empty(), "Empty video source", -1);
+
+        frameSize.width = frame.cols;
+        frameSize.height = frame.rows;
+    }
+
+    NcvBool bUseGPU = true;
+    NcvBool bLargestObject = false;
+    NcvBool bFilterRects = true;
+    NcvBool bHelpScreen = false;
+
+    CascadeClassifier classifierOpenCV;
+    ncvAssertPrintReturn(classifierOpenCV.load(cascadeName) != 0, "Error (in OpenCV) opening classifier", -1);
+
+    int devId;
+    ncvAssertCUDAReturn(cudaGetDevice(&devId), -1);
+    cudaDeviceProp devProp;
+    ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), -1);
+    cout << "Using GPU: " << devId << "(" << devProp.name <<
+            "), arch=" << devProp.major << "." << devProp.minor << endl;
+
+    //==============================================================================
+    //
+    // Load the classifier from file (assuming its size is about 1 mb)
+    // using a simple allocator
+    //
+    //==============================================================================
+
+    NCVMemNativeAllocator gpuCascadeAllocator(NCVMemoryTypeDevice, static_cast<Ncv32u>(devProp.textureAlignment));
+    ncvAssertPrintReturn(gpuCascadeAllocator.isInitialized(), "Error creating cascade GPU allocator", -1);
+    NCVMemNativeAllocator cpuCascadeAllocator(NCVMemoryTypeHostPinned, static_cast<Ncv32u>(devProp.textureAlignment));
+    ncvAssertPrintReturn(cpuCascadeAllocator.isInitialized(), "Error creating cascade CPU allocator", -1);
+
+    Ncv32u haarNumStages, haarNumNodes, haarNumFeatures;
+    ncvStat = ncvHaarGetClassifierSize(cascadeName, haarNumStages, haarNumNodes, haarNumFeatures);
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", -1);
+
+    NCVVectorAlloc<HaarStage64> h_haarStages(cpuCascadeAllocator, haarNumStages);
+    ncvAssertPrintReturn(h_haarStages.isMemAllocated(), "Error in cascade CPU allocator", -1);
+    NCVVectorAlloc<HaarClassifierNode128> h_haarNodes(cpuCascadeAllocator, haarNumNodes);
+    ncvAssertPrintReturn(h_haarNodes.isMemAllocated(), "Error in cascade CPU allocator", -1);
+    NCVVectorAlloc<HaarFeature64> h_haarFeatures(cpuCascadeAllocator, haarNumFeatures);
+
+    ncvAssertPrintReturn(h_haarFeatures.isMemAllocated(), "Error in cascade CPU allocator", -1);
+
+    HaarClassifierCascadeDescriptor haar;
+    ncvStat = ncvHaarLoadFromFile_host(cascadeName, haar, h_haarStages, h_haarNodes, h_haarFeatures);
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", -1);
+
+    NCVVectorAlloc<HaarStage64> d_haarStages(gpuCascadeAllocator, haarNumStages);
+    ncvAssertPrintReturn(d_haarStages.isMemAllocated(), "Error in cascade GPU allocator", -1);
+    NCVVectorAlloc<HaarClassifierNode128> d_haarNodes(gpuCascadeAllocator, haarNumNodes);
+    ncvAssertPrintReturn(d_haarNodes.isMemAllocated(), "Error in cascade GPU allocator", -1);
+    NCVVectorAlloc<HaarFeature64> d_haarFeatures(gpuCascadeAllocator, haarNumFeatures);
+    ncvAssertPrintReturn(d_haarFeatures.isMemAllocated(), "Error in cascade GPU allocator", -1);
+
+    ncvStat = h_haarStages.copySolid(d_haarStages, 0);
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);
+    ncvStat = h_haarNodes.copySolid(d_haarNodes, 0);
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);
+    ncvStat = h_haarFeatures.copySolid(d_haarFeatures, 0);
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);
+
+    //==============================================================================
+    //
+    // Calculate memory requirements and create real allocators
+    //
+    //==============================================================================
+
+    NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
+    ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", -1);
+    NCVMemStackAllocator cpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
+    ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", -1);
+    // Dry run without an image: the counters' maxSize() then sizes the real stack allocators below.
+    ncvStat = process(NULL, frameSize.width, frameSize.height,
+                      false, false, haar,
+                      d_haarStages, d_haarNodes,
+                      d_haarFeatures, h_haarStages,
+                      gpuCounter, cpuCounter, devProp);
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in memory counting pass", -1);
+
+    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), static_cast<Ncv32u>(devProp.textureAlignment));
+    ncvAssertPrintReturn(gpuAllocator.isInitialized(), "Error creating GPU memory allocator", -1);
+    NCVMemStackAllocator cpuAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), static_cast<Ncv32u>(devProp.textureAlignment));
+    ncvAssertPrintReturn(cpuAllocator.isInitialized(), "Error creating CPU memory allocator", -1);
+
+    printf("Initialized for frame size [%dx%d]\n", frameSize.width, frameSize.height);
+
+    //==============================================================================
+    //
+    // Main processing loop
+    //
+    //==============================================================================
+
+    namedWindow(wndTitle, 1);
+    Mat frameDisp; // the grayscale working image is declared per iteration inside the loop
+
+    do
+    {
+        Mat gray;
+        cvtColor((image.empty() ? frame : image), gray, CV_BGR2GRAY);
+
+        //
+        // process
+        //
+
+        NcvSize32u minSize = haar.ClassifierSize;
+        if (bLargestObject)
+        {
+            Ncv32u ratioX = preferredVideoFrameSize.width / minSize.width;
+            Ncv32u ratioY = preferredVideoFrameSize.height / minSize.height;
+            Ncv32u ratioSmallest = min(ratioX, ratioY);
+            ratioSmallest = max((Ncv32u)(ratioSmallest / 2.5f), (Ncv32u)1);
+            minSize.width *= ratioSmallest;
+            minSize.height *= ratioSmallest;
+        }
+
+        Ncv32f avgTime;
+        NcvTimer timer = ncvStartTimer();
+
+        if (bUseGPU)
+        {
+            ncvStat = process(&gray, frameSize.width, frameSize.height,
+                              bFilterRects, bLargestObject, haar,
+                              d_haarStages, d_haarNodes,
+                              d_haarFeatures, h_haarStages,
+                              gpuAllocator, cpuAllocator, devProp);
+            ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in GPU processing pass", -1);
+        }
+        else
+        {
+            vector<Rect> rectsOpenCV;
+
+            classifierOpenCV.detectMultiScale(
+                gray,
+                rectsOpenCV,
+                1.2f,
+                bFilterRects ? 4 : 0,
+                (bLargestObject ? CV_HAAR_FIND_BIGGEST_OBJECT : 0)
+                | CV_HAAR_SCALE_IMAGE,
+                Size(minSize.width, minSize.height));
+
+            for (size_t rt = 0; rt < rectsOpenCV.size(); ++rt)
+                rectangle(gray, rectsOpenCV[rt], Scalar(255));
+        }
+
+        avgTime = (Ncv32f)ncvEndQueryTimerMs(timer);
+
+        cvtColor(gray, frameDisp, CV_GRAY2BGR);
+        displayState(frameDisp, bHelpScreen, bUseGPU, bLargestObject, bFilterRects, 1000.0f / avgTime);
+        imshow(wndTitle, frameDisp);
+
+        //handle input
+        switch (cvWaitKey(3))
+        {
+        case ' ':
+            bUseGPU = !bUseGPU;
+            break;
+        case 'm':
+        case 'M':
+            bLargestObject = !bLargestObject;
+            break;
+        case 'f':
+        case 'F':
+            bFilterRects = !bFilterRects;
+            break;
+        case 'h':
+        case 'H':
+            bHelpScreen = !bHelpScreen;
+            break;
+        case 27:
+            bQuit = true;
+            break;
+        }
+
+        // For camera and video file, capture the next image
+        if (capture.isOpened())
+        {
+            capture >> frame;
+            if (frame.empty())
+            {
+                break;
+            }
+        }
+    } while (!bQuit);
+
+    cvDestroyWindow(wndTitle.c_str());
+
+    return 0;
+}
+
+#endif //!defined(HAVE_CUDA)
--- a/samples/gpu/driver_api_multi.cpp
+++ b/samples/gpu/driver_api_multi.cpp
@@ -1,152 +1,152 @@
-/* This sample demonstrates the way you can perform independed tasks 
-   on the different GPUs */
-
-// Disable some warnings which are caused with CUDA headers
-#if defined(_MSC_VER)
-#pragma warning(disable: 4201 4408 4100)
-#endif
-
-#include <iostream>
-#include "cvconfig.h"
-#include "opencv2/core/core.hpp"
-#include "opencv2/gpu/gpu.hpp"
-
-#if !defined(HAVE_CUDA) || !defined(HAVE_TBB)
-
-int main()
-{
-#if !defined(HAVE_CUDA)
-    std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true).\n";
-#endif
-
-#if !defined(HAVE_TBB)
-    std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
-#endif
-
-    return 0;
-}
-
-#else
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include "opencv2/core/internal.hpp" // For TBB wrappers
-
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-
-struct Worker { void operator()(int device_id) const; };
-void destroyContexts();
-
-#define safeCall(expr) safeCall_(expr, #expr, __FILE__, __LINE__)
-inline void safeCall_(int code, const char* expr, const char* file, int line)
-{
-    if (code != CUDA_SUCCESS)
-    {
-        std::cout << "CUDA driver API error: code " << code << ", expr " << expr
-            << ", file " << file << ", line " << line << endl;
-        destroyContexts();
-        exit(-1);
-    }
-}
-
-// Each GPU is associated with its own context
-CUcontext contexts[2];
-
-int main(int argc, char **argv)
-{
-    if (argc > 1)
-    {
-        cout << "CUDA driver API sample\n";
-        return -1;
-    }
-
-    int num_devices = getCudaEnabledDeviceCount();
-    if (num_devices < 2)
-    {
-        std::cout << "Two or more GPUs are required\n";
-        return -1;
-    }
-
-    for (int i = 0; i < num_devices; ++i)
-    {
-        cv::gpu::printShortCudaDeviceInfo(i);
-
-        DeviceInfo dev_info(i);
-        if (!dev_info.isCompatible())
-        {
-            std::cout << "GPU module isn't built for GPU #" << i << " ("
-                 << dev_info.name() << ", CC " << dev_info.majorVersion()
-                 << dev_info.minorVersion() << "\n";
-            return -1;
-        }
-    }
-
-    // Init CUDA Driver API
-    safeCall(cuInit(0));
-
-    // Create context for GPU #0
-    CUdevice device;
-    safeCall(cuDeviceGet(&device, 0));
-    safeCall(cuCtxCreate(&contexts[0], 0, device));
-
-    CUcontext prev_context;
-    safeCall(cuCtxPopCurrent(&prev_context));
-
-    // Create context for GPU #1
-    safeCall(cuDeviceGet(&device, 1));
-    safeCall(cuCtxCreate(&contexts[1], 0, device));
-
-    safeCall(cuCtxPopCurrent(&prev_context));
-
-    // Execute calculation in two threads using two GPUs
-    int devices[] = {0, 1};
-    parallel_do(devices, devices + 2, Worker());
-
-    destroyContexts();
-    return 0;
-}
-
-
-void Worker::operator()(int device_id) const
-{
-    // Set the proper context
-    safeCall(cuCtxPushCurrent(contexts[device_id]));
-
-    Mat src(1000, 1000, CV_32F);
-    Mat dst;
-
-    RNG rng(0);
-    rng.fill(src, RNG::UNIFORM, 0, 1);
-
-    // CPU works
-    transpose(src, dst);
-
-    // GPU works
-    GpuMat d_src(src);
-    GpuMat d_dst;
-    transpose(d_src, d_dst);
-
-    // Check results
-    bool passed = norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
-    std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): "
-        << (passed ? "passed" : "FAILED") << endl;
-
-    // Deallocate data here, otherwise deallocation will be performed
-    // after context is extracted from the stack
-    d_src.release();
-    d_dst.release();
-
-    CUcontext prev_context;
-    safeCall(cuCtxPopCurrent(&prev_context));
-}
-
-
-void destroyContexts()
-{
-    safeCall(cuCtxDestroy(contexts[0]));
-    safeCall(cuCtxDestroy(contexts[1]));
-}
-
-#endif
+/* This sample demonstrates the way you can perform independent tasks
+   on the different GPUs */
+
+// Disable some warnings which are caused with CUDA headers
+#if defined(_MSC_VER)
+#pragma warning(disable: 4201 4408 4100)
+#endif
+
+#include <iostream>
+#include "cvconfig.h"
+#include "opencv2/core/core.hpp"
+#include "opencv2/gpu/gpu.hpp"
+
+#if !defined(HAVE_CUDA) || !defined(HAVE_TBB)
+
+int main()
+{
+    // Fallback entry point: only names the missing build options, then exits with 0.
+#ifndef HAVE_CUDA
+    std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true).\n";
+#endif
+#ifndef HAVE_TBB
+    std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
+#endif
+
+    return 0;
+}
+
+#else
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "opencv2/core/internal.hpp" // For TBB wrappers
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+struct Worker { void operator()(int device_id) const; }; // functor handed to parallel_do in main; called with a device id
+void destroyContexts(); // declared early because safeCall_ calls it before its definition
+
+#define safeCall(expr) safeCall_(expr, #expr, __FILE__, __LINE__)
+// Checks a CUDA driver API result; on failure prints the code and call site, destroys the contexts and exits with -1.
+inline void safeCall_(int code, const char* expr, const char* file, int line)
+{
+    static bool cleaningUp = false; // destroyContexts() itself goes through safeCall
+    if (code == CUDA_SUCCESS) return;
+    std::cout << "CUDA driver API error: code " << code << ", expr " << expr
+        << ", file " << file << ", line " << line << endl;
+    if (!cleaningUp) { cleaningUp = true; destroyContexts(); } // a failing cuCtxDestroy must not recurse without bound
+    exit(-1);
+}
+
+// Each GPU is associated with its own context
+CUcontext contexts[2];
+// Entry point: requires at least two GPUs (all compatible with this build), creates one driver-API context for GPU #0 and #1 and runs a Worker on each.
+int main(int argc, char **argv)
+{
+    if (argc > 1)
+    {
+        cout << "CUDA driver API sample\n";
+        return -1;
+    }
+
+    int num_devices = getCudaEnabledDeviceCount();
+    if (num_devices < 2)
+    {
+        std::cout << "Two or more GPUs are required\n";
+        return -1;
+    }
+
+    for (int i = 0; i < num_devices; ++i)
+    {
+        cv::gpu::printShortCudaDeviceInfo(i);
+
+        DeviceInfo dev_info(i);
+        if (!dev_info.isCompatible())
+        {
+            std::cout << "GPU module isn't built for GPU #" << i << " ("
+                 << dev_info.name() << ", CC " << dev_info.majorVersion()
+                 << "." << dev_info.minorVersion() << ")\n";
+            return -1;
+        }
+    }
+
+    // Init CUDA Driver API
+    safeCall(cuInit(0));
+
+    // Create context for GPU #0
+    CUdevice device;
+    safeCall(cuDeviceGet(&device, 0));
+    safeCall(cuCtxCreate(&contexts[0], 0, device));
+
+    CUcontext prev_context;
+    safeCall(cuCtxPopCurrent(&prev_context));
+
+    // Create context for GPU #1
+    safeCall(cuDeviceGet(&device, 1));
+    safeCall(cuCtxCreate(&contexts[1], 0, device));
+
+    safeCall(cuCtxPopCurrent(&prev_context));
+
+    // Execute calculation in two threads using two GPUs
+    int devices[] = {0, 1};
+    parallel_do(devices, devices + 2, Worker());
+
+    destroyContexts();
+    return 0;
+}
+
+// Per-device task: transposes a random 1000x1000 float matrix on the CPU and on the GPU and prints whether the results agree.
+void Worker::operator()(int device_id) const
+{
+    // Push the context that main created for this device
+    safeCall(cuCtxPushCurrent(contexts[device_id]));
+
+    Mat src(1000, 1000, CV_32F);
+    Mat dst;
+
+    RNG rng(0); // fixed seed, so every worker fills src with the same values
+    rng.fill(src, RNG::UNIFORM, 0, 1);
+
+    // CPU reference result
+    transpose(src, dst);
+
+    // Same operation on the GPU (the GpuMat constructor receives the host matrix)
+    GpuMat d_src(src);
+    GpuMat d_dst;
+    transpose(d_src, d_dst);
+
+    // Compare: largest absolute difference must stay below 1e-3
+    bool passed = norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
+    std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): "
+        << (passed ? "passed" : "FAILED") << endl;
+
+    // Deallocate data here, otherwise deallocation will be performed
+    // after context is extracted from the stack
+    d_src.release();
+    d_dst.release();
+
+    CUcontext prev_context;
+    safeCall(cuCtxPopCurrent(&prev_context));
+}
+
+// Destroys the contexts of GPU #0 and GPU #1; each cuCtxDestroy result is checked by safeCall.
+void destroyContexts()
+{
+    safeCall(cuCtxDestroy(contexts[0]));
+    safeCall(cuCtxDestroy(contexts[1]));
+}
+
+#endif
--- a/samples/gpu/driver_api_stereo_multi.cpp
+++ b/samples/gpu/driver_api_stereo_multi.cpp
@@ -1,211 +1,211 @@
-/* This sample demonstrates working on one piece of data using two GPUs.
-   It splits input into two parts and processes them separately on different
-   GPUs. */
-
-// Disable some warnings which are caused with CUDA headers
-#if defined(_MSC_VER)
-#pragma warning(disable: 4201 4408 4100)
-#endif
-
-#include <iostream>
-#include "cvconfig.h"
-#include "opencv2/core/core.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/gpu/gpu.hpp"
-
-#if !defined(HAVE_CUDA) || !defined(HAVE_TBB)
-
-int main()
-{
-#if !defined(HAVE_CUDA)
-    std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true).\n";
-#endif
-
-#if !defined(HAVE_TBB)
-    std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
-#endif
-
-    return 0;
-}
-
-#else
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include "opencv2/core/internal.hpp" // For TBB wrappers
-
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-
-struct Worker { void operator()(int device_id) const; };
-void destroyContexts();
-
-#define safeCall(expr) safeCall_(expr, #expr, __FILE__, __LINE__)
-inline void safeCall_(int code, const char* expr, const char* file, int line)
-{
-    if (code != CUDA_SUCCESS)
-    {
-        std::cout << "CUDA driver API error: code " << code << ", expr " << expr
-            << ", file " << file << ", line " << line << endl;
-        destroyContexts();
-        exit(-1);
-    }
-}
-
-// Each GPU is associated with its own context
-CUcontext contexts[2];
-
-void inline contextOn(int id)
-{
-    safeCall(cuCtxPushCurrent(contexts[id]));
-}
-
-void inline contextOff()
-{
-    CUcontext prev_context;
-    safeCall(cuCtxPopCurrent(&prev_context));
-}
-
-// GPUs data
-GpuMat d_left[2];
-GpuMat d_right[2];
-StereoBM_GPU* bm[2];
-GpuMat d_result[2];
-
-// CPU result
-Mat result;
-
-void printHelp()
-{
-    std::cout << "Usage: driver_api_stereo_multi_gpu --left <left_image> --right <right_image>\n";
-}
-
-int main(int argc, char** argv)
-{
-    if (argc < 5)
-    {
-        printHelp();        
-        return -1;
-    }
-
-    int num_devices = getCudaEnabledDeviceCount();
-    if (num_devices < 2)
-    {
-        std::cout << "Two or more GPUs are required\n";
-        return -1;
-    }
-
-    for (int i = 0; i < num_devices; ++i)
-    {
-        cv::gpu::printShortCudaDeviceInfo(i);
-
-        DeviceInfo dev_info(i);
-        if (!dev_info.isCompatible())
-        {
-            std::cout << "GPU module isn't built for GPU #" << i << " ("
-                 << dev_info.name() << ", CC " << dev_info.majorVersion()
-                 << dev_info.minorVersion() << "\n";
-            return -1;
-        }
-    }
-
-    // Load input data
-    Mat left, right;
-    for (int i = 1; i < argc; ++i)
-    {
-        if (string(argv[i]) == "--left")
-        {
-            left = imread(argv[++i], CV_LOAD_IMAGE_GRAYSCALE);
-            CV_Assert(!left.empty());
-        }
-        else if (string(argv[i]) == "--right")
-        {
-            right = imread(argv[++i], CV_LOAD_IMAGE_GRAYSCALE);
-            CV_Assert(!right.empty());
-        }
-        else if (string(argv[i]) == "--help")
-        {
-            printHelp();
-            return -1;
-        }
-    }
-
-
-    // Init CUDA Driver API
-    safeCall(cuInit(0));
-
-    // Create context for GPU #0
-    CUdevice device;
-    safeCall(cuDeviceGet(&device, 0));
-    safeCall(cuCtxCreate(&contexts[0], 0, device));
-    contextOff();
-
-    // Create context for GPU #1
-    safeCall(cuDeviceGet(&device, 1));
-    safeCall(cuCtxCreate(&contexts[1], 0, device));
-    contextOff();
-
-    // Split source images for processing on GPU #0
-    contextOn(0);
-    d_left[0].upload(left.rowRange(0, left.rows / 2));
-    d_right[0].upload(right.rowRange(0, right.rows / 2));
-    bm[0] = new StereoBM_GPU();
-    contextOff();
-
-    // Split source images for processing on the GPU #1
-    contextOn(1);
-    d_left[1].upload(left.rowRange(left.rows / 2, left.rows));
-    d_right[1].upload(right.rowRange(right.rows / 2, right.rows));
-    bm[1] = new StereoBM_GPU();
-    contextOff();
-
-    // Execute calculation in two threads using two GPUs
-    int devices[] = {0, 1};
-    parallel_do(devices, devices + 2, Worker());
-
-    // Release the first GPU resources
-    contextOn(0);
-    imshow("GPU #0 result", Mat(d_result[0]));
-    d_left[0].release();
-    d_right[0].release();
-    d_result[0].release();
-    delete bm[0];
-    contextOff();
-
-    // Release the second GPU resources
-    contextOn(1);
-    imshow("GPU #1 result", Mat(d_result[1]));
-    d_left[1].release();
-    d_right[1].release();
-    d_result[1].release();
-    delete bm[1];
-    contextOff();
-
-    waitKey();
-    destroyContexts();
-    return 0;
-}
-
-
-void Worker::operator()(int device_id) const
-{
-    contextOn(device_id);
-
-    bm[device_id]->operator()(d_left[device_id], d_right[device_id],
-                              d_result[device_id]);
-
-    std::cout << "GPU #" << device_id << " (" << DeviceInfo().name()
-        << "): finished\n";
-
-    contextOff();
-}
-
-
-void destroyContexts()
-{
-    safeCall(cuCtxDestroy(contexts[0]));
-    safeCall(cuCtxDestroy(contexts[1]));
-}
-
-#endif
+/* This sample demonstrates working on one piece of data using two GPUs.
+   It splits input into two parts and processes them separately on different
+   GPUs. */
+
+// Disable some warnings which are caused with CUDA headers
+#if defined(_MSC_VER)
+#pragma warning(disable: 4201 4408 4100)
+#endif
+
+#include <iostream>
+#include "cvconfig.h"
+#include "opencv2/core/core.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/gpu/gpu.hpp"
+
+#if !defined(HAVE_CUDA) || !defined(HAVE_TBB)
+
+int main()
+{
+    // Fallback entry point: only names the missing build options, then exits with 0.
+#ifndef HAVE_CUDA
+    std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true).\n";
+#endif
+#ifndef HAVE_TBB
+    std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
+#endif
+
+    return 0;
+}
+
+#else
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "opencv2/core/internal.hpp" // For TBB wrappers
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+struct Worker { void operator()(int device_id) const; }; // functor handed to parallel_do in main; called with a device id
+void destroyContexts(); // declared early because safeCall_ calls it before its definition
+
+#define safeCall(expr) safeCall_(expr, #expr, __FILE__, __LINE__)
+// Checks a CUDA driver API result; on failure prints the code and call site, destroys the contexts and exits with -1.
+inline void safeCall_(int code, const char* expr, const char* file, int line)
+{
+    static bool cleaningUp = false; // destroyContexts() itself goes through safeCall
+    if (code == CUDA_SUCCESS) return;
+    std::cout << "CUDA driver API error: code " << code << ", expr " << expr
+        << ", file " << file << ", line " << line << endl;
+    if (!cleaningUp) { cleaningUp = true; destroyContexts(); } // a failing cuCtxDestroy must not recurse without bound
+    exit(-1);
+}
+
+// Each GPU is associated with its own context
+CUcontext contexts[2];
+// Pushes the context created for GPU #id; exits through safeCall if the push fails.
+void inline contextOn(int id)
+{
+    safeCall(cuCtxPushCurrent(contexts[id]));
+}
+// Pops the current context; the popped handle is discarded.
+void inline contextOff()
+{
+    CUcontext prev_context;
+    safeCall(cuCtxPopCurrent(&prev_context));
+}
+
+// GPUs data
+GpuMat d_left[2];
+GpuMat d_right[2];
+StereoBM_GPU* bm[2];
+GpuMat d_result[2];
+
+// CPU result
+Mat result;
+// Prints the one-line command-line usage summary to stdout.
+void printHelp()
+{
+    std::cout << "Usage: driver_api_stereo_multi_gpu --left <left_image> --right <right_image>\n";
+}
+// Entry point: loads the stereo pair, runs block matching on the first half of the rows on GPU #0 and on the rest on GPU #1, then shows both results.
+int main(int argc, char** argv)
+{
+    if (argc < 5)
+    {
+        printHelp();
+        return -1;
+    }
+
+    int num_devices = getCudaEnabledDeviceCount();
+    if (num_devices < 2)
+    {
+        std::cout << "Two or more GPUs are required\n";
+        return -1;
+    }
+
+    for (int i = 0; i < num_devices; ++i)
+    {
+        cv::gpu::printShortCudaDeviceInfo(i);
+
+        DeviceInfo dev_info(i);
+        if (!dev_info.isCompatible())
+        {
+            std::cout << "GPU module isn't built for GPU #" << i << " ("
+                 << dev_info.name() << ", CC " << dev_info.majorVersion()
+                 << "." << dev_info.minorVersion() << ")\n";
+            return -1;
+        }
+    }
+
+    // Load input data
+    Mat left, right;
+    for (int i = 1; i < argc; ++i)
+    {
+        if (string(argv[i]) == "--left" && i + 1 < argc)
+        {
+            left = imread(argv[++i], CV_LOAD_IMAGE_GRAYSCALE);
+            CV_Assert(!left.empty());
+        }
+        else if (string(argv[i]) == "--right" && i + 1 < argc)
+        {
+            right = imread(argv[++i], CV_LOAD_IMAGE_GRAYSCALE);
+            CV_Assert(!right.empty());
+        }
+        else if (string(argv[i]) == "--help")
+        {
+            printHelp();
+            return -1;
+        }
+    }
+    if (left.empty() || right.empty()) { printHelp(); return -1; } // an option was missing or had no value
+    CV_Assert(left.size() == right.size()); // both views are split at the same row below
+    // Init CUDA Driver API
+    safeCall(cuInit(0));
+
+    // Create context for GPU #0
+    CUdevice device;
+    safeCall(cuDeviceGet(&device, 0));
+    safeCall(cuCtxCreate(&contexts[0], 0, device));
+    contextOff();
+
+    // Create context for GPU #1
+    safeCall(cuDeviceGet(&device, 1));
+    safeCall(cuCtxCreate(&contexts[1], 0, device));
+    contextOff();
+
+    // Split source images for processing on GPU #0
+    contextOn(0);
+    d_left[0].upload(left.rowRange(0, left.rows / 2));
+    d_right[0].upload(right.rowRange(0, right.rows / 2));
+    bm[0] = new StereoBM_GPU();
+    contextOff();
+
+    // Split source images for processing on the GPU #1
+    contextOn(1);
+    d_left[1].upload(left.rowRange(left.rows / 2, left.rows));
+    d_right[1].upload(right.rowRange(right.rows / 2, right.rows));
+    bm[1] = new StereoBM_GPU();
+    contextOff();
+
+    // Execute calculation in two threads using two GPUs
+    int devices[] = {0, 1};
+    parallel_do(devices, devices + 2, Worker());
+
+    // Release the first GPU resources
+    contextOn(0);
+    imshow("GPU #0 result", Mat(d_result[0]));
+    d_left[0].release();
+    d_right[0].release();
+    d_result[0].release();
+    delete bm[0];
+    contextOff();
+
+    // Release the second GPU resources
+    contextOn(1);
+    imshow("GPU #1 result", Mat(d_result[1]));
+    d_left[1].release();
+    d_right[1].release();
+    d_result[1].release();
+    delete bm[1];
+    contextOff();
+
+    waitKey();
+    destroyContexts();
+    return 0;
+}
+
+// Per-device task: runs the block matcher of device_id on its image halves, writing into d_result[device_id].
+void Worker::operator()(int device_id) const
+{
+    contextOn(device_id);
+
+    bm[device_id]->operator()(d_left[device_id], d_right[device_id],
+                              d_result[device_id]);
+
+    std::cout << "GPU #" << device_id << " (" << DeviceInfo().name()
+        << "): finished\n";
+
+    contextOff();
+}
+
+// Destroys the contexts of GPU #0 and GPU #1; each cuCtxDestroy result is checked by safeCall.
+void destroyContexts()
+{
+    safeCall(cuCtxDestroy(contexts[0]));
+    safeCall(cuCtxDestroy(contexts[1]));
+}
+
+#endif
--- a/samples/gpu/highgui_gpu.cpp
+++ b/samples/gpu/highgui_gpu.cpp
@@ -1,135 +1,135 @@
-#include <iostream>
-#include <string>
-
-#include "opencv2/core/core.hpp"
-#include "opencv2/core/gpumat.hpp"
-#include "opencv2/core/opengl_interop.hpp"
-#include "opencv2/gpu/gpu.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/contrib/contrib.hpp"
-
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-
-struct Timer
-{
-    Timer(const string& msg_)
-    {
-        msg = msg_;
-
-        tm.reset();
-        tm.start();
-    }
-
-    ~Timer()
-    {
-        tm.stop();
-        cout << msg << " " << tm.getTimeMilli() << " ms\n";
-    }
-
-    string msg;
-    TickMeter tm;
-};
-
-int main(int argc, char* argv[])
-{
-    if (argc < 2)
-    {
-        cout << "Usage: " << argv[0] << " image" << endl;
-        return -1;
-    }
-
-    try
-    {
-        bool haveCuda = getCudaEnabledDeviceCount() > 0;
-
-        const string openGlMatWnd = "OpenGL Mat";
-        const string openGlBufferWnd = "OpenGL GlBuffer";
-        const string openGlTextureWnd = "OpenGL GlTexture";
-        const string openGlGpuMatWnd = "OpenGL GpuMat";
-        const string matWnd = "Mat";
-
-        namedWindow(openGlMatWnd, WINDOW_OPENGL | WINDOW_AUTOSIZE);
-        namedWindow(openGlBufferWnd, WINDOW_OPENGL | WINDOW_AUTOSIZE);
-        namedWindow(openGlTextureWnd, WINDOW_OPENGL | WINDOW_AUTOSIZE);
-        if (haveCuda)
-            namedWindow(openGlGpuMatWnd, WINDOW_OPENGL | WINDOW_AUTOSIZE);
-        namedWindow("Mat", WINDOW_AUTOSIZE);
-
-        Mat img = imread(argv[1]);
-        
-        if (haveCuda)
-            setGlDevice();
-
-        setOpenGlContext(openGlBufferWnd);
-        GlBuffer buf(img, GlBuffer::TEXTURE_BUFFER);
-
-        setOpenGlContext(openGlTextureWnd);
-        GlTexture tex(img);
-        
-        GpuMat d_img;
-        if (haveCuda)
-            d_img.upload(img);
-            
-        cout << "=== First call\n\n";
-
-        {
-            Timer t("OpenGL Mat      ");
-            imshow(openGlMatWnd, img);
-        }
-        {
-            Timer t("OpenGL GlBuffer ");
-            imshow(openGlBufferWnd, buf);
-        }
-        {
-            Timer t("OpenGL GlTexture");
-            imshow(openGlTextureWnd, tex);
-        }
-        if (haveCuda)
-        {
-            Timer t("OpenGL GpuMat   ");
-            imshow(openGlGpuMatWnd, d_img);
-        }
-        {
-            Timer t("Mat             ");
-            imshow(matWnd, img);
-        }
-
-        waitKey();
-
-        cout << "\n=== Second call\n\n";   
-
-        {
-            Timer t("OpenGL Mat      ");
-            imshow(openGlMatWnd, img);
-        }
-        {
-            Timer t("OpenGL GlBuffer ");
-            imshow(openGlBufferWnd, buf);
-        }
-        {
-            Timer t("OpenGL GlTexture");
-            imshow(openGlTextureWnd, tex);
-        }
-        if (haveCuda)
-        {
-            Timer t("OpenGL GpuMat   ");
-            imshow(openGlGpuMatWnd, d_img);
-        }
-        {
-            Timer t("Mat             ");
-            imshow(matWnd, img);
-        }
-
-        cout << "\n";
-
-        waitKey();
-    }
-    catch(const exception& e)
-    {
-        cout << e.what() << endl;
-    }
-
-    return 0;
-}
+#include <iostream>
+#include <string>
+
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/opengl_interop.hpp"
+#include "opencv2/gpu/gpu.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/contrib/contrib.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+// Scoped stopwatch: timing starts in the constructor and the destructor
+// prints "<msg> <elapsed> ms" to stdout.
+struct Timer
+{
+    Timer(const string& msg_) : msg(msg_)
+    {
+        tm.reset();
+        tm.start();
+    }
+
+    ~Timer()
+    {
+        tm.stop();
+        const double elapsed_ms = tm.getTimeMilli();
+        cout << msg << " " << elapsed_ms << " ms\n";
+    }
+
+    string msg;    // label printed in front of the measurement
+    TickMeter tm;  // runs for the lifetime of this object
+};
+
+// Shows one image through several highgui display paths (OpenGL window fed
+// from a Mat, a GlBuffer, a GlTexture and, when a CUDA device is present, a
+// GpuMat, plus a plain window) and prints how long each imshow call takes.
+// The whole series is run twice, separated by a key press.
+//
+// argv[1] is the image path. Returns -1 on a usage error or when the image
+// cannot be read, 0 otherwise (exceptions are reported on stdout).
+int main(int argc, char* argv[])
+{
+    if (argc < 2)
+    {
+        cout << "Usage: " << argv[0] << " image" << endl;
+        return -1;
+    }
+
+    try
+    {
+        bool haveCuda = getCudaEnabledDeviceCount() > 0;
+
+        const string openGlMatWnd = "OpenGL Mat";
+        const string openGlBufferWnd = "OpenGL GlBuffer";
+        const string openGlTextureWnd = "OpenGL GlTexture";
+        const string openGlGpuMatWnd = "OpenGL GpuMat";
+        const string matWnd = "Mat";
+
+        namedWindow(openGlMatWnd, WINDOW_OPENGL | WINDOW_AUTOSIZE);
+        namedWindow(openGlBufferWnd, WINDOW_OPENGL | WINDOW_AUTOSIZE);
+        namedWindow(openGlTextureWnd, WINDOW_OPENGL | WINDOW_AUTOSIZE);
+        if (haveCuda)
+            namedWindow(openGlGpuMatWnd, WINDOW_OPENGL | WINDOW_AUTOSIZE);
+        // Use the same constant that the imshow calls below use.
+        namedWindow(matWnd, WINDOW_AUTOSIZE);
+
+        Mat img = imread(argv[1]);
+
+        // imread signals failure with an empty matrix; stop here instead of
+        // handing an empty image to the GL/GPU objects below.
+        if (img.empty())
+        {
+            cout << "Can't read image: " << argv[1] << endl;
+            return -1;
+        }
+
+        if (haveCuda)
+            setGlDevice();
+
+        // Each GL object is created while its own window's context is selected.
+        setOpenGlContext(openGlBufferWnd);
+        GlBuffer buf(img, GlBuffer::TEXTURE_BUFFER);
+
+        setOpenGlContext(openGlTextureWnd);
+        GlTexture tex(img);
+
+        GpuMat d_img;
+        if (haveCuda)
+            d_img.upload(img);
+
+        cout << "=== First call\n\n";
+
+        {
+            Timer t("OpenGL Mat      ");
+            imshow(openGlMatWnd, img);
+        }
+        {
+            Timer t("OpenGL GlBuffer ");
+            imshow(openGlBufferWnd, buf);
+        }
+        {
+            Timer t("OpenGL GlTexture");
+            imshow(openGlTextureWnd, tex);
+        }
+        if (haveCuda)
+        {
+            Timer t("OpenGL GpuMat   ");
+            imshow(openGlGpuMatWnd, d_img);
+        }
+        {
+            Timer t("Mat             ");
+            imshow(matWnd, img);
+        }
+
+        waitKey();
+
+        cout << "\n=== Second call\n\n";
+
+        {
+            Timer t("OpenGL Mat      ");
+            imshow(openGlMatWnd, img);
+        }
+        {
+            Timer t("OpenGL GlBuffer ");
+            imshow(openGlBufferWnd, buf);
+        }
+        {
+            Timer t("OpenGL GlTexture");
+            imshow(openGlTextureWnd, tex);
+        }
+        if (haveCuda)
+        {
+            Timer t("OpenGL GpuMat   ");
+            imshow(openGlGpuMatWnd, d_img);
+        }
+        {
+            Timer t("Mat             ");
+            imshow(matWnd, img);
+        }
+
+        cout << "\n";
+
+        waitKey();
+    }
+    catch(const exception& e)
+    {
+        cout << e.what() << endl;
+    }
+
+    return 0;
+}
--- a/samples/gpu/hog.cpp
+++ b/samples/gpu/hog.cpp
@@ -1,459 +1,459 @@
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <sstream>
-#include <iomanip>
-#include <stdexcept>
-#include "opencv2/gpu/gpu.hpp"
-#include "opencv2/highgui/highgui.hpp"
-
-using namespace std;
-using namespace cv;
-
-bool help_showed = false;
-
-class Args
-{
-public:
-    Args();
-    static Args read(int argc, char** argv);
-
-    string src;
-    bool src_is_video;
-    bool src_is_camera;
-    int camera_id;
-
-    bool write_video;
-    string dst_video;
-    double dst_video_fps;
-
-    bool make_gray;
-
-    bool resize_src;
-    int width, height;
-
-    double scale;
-    int nlevels;
-    int gr_threshold;
-
-    double hit_threshold;
-    bool hit_threshold_auto;
-
-    int win_width;
-    int win_stride_width, win_stride_height;
-
-    bool gamma_corr;
-};
-
-
-class App
-{
-public:
-    App(const Args& s);
-    void run();
-
-    void handleKey(char key);
-
-    void hogWorkBegin();
-    void hogWorkEnd();
-    string hogWorkFps() const;
-
-    void workBegin();
-    void workEnd();
-    string workFps() const;
-
-    string message() const;
-
-private:
-    App operator=(App&);
-
-    Args args;
-    bool running;
-
-    bool use_gpu;
-    bool make_gray;
-    double scale;
-    int gr_threshold;
-    int nlevels;
-    double hit_threshold;
-    bool gamma_corr;
-
-    int64 hog_work_begin;
-    double hog_work_fps;
-
-    int64 work_begin;
-    double work_fps;
-};
-
-static void printHelp()
-{
-    cout << "Histogram of Oriented Gradients descriptor and detector sample.\n"
-         << "\nUsage: hog_gpu\n"
-         << "  (<image>|--video <vide>|--camera <camera_id>) # frames source\n"
-         << "  [--make_gray <true/false>] # convert image to gray one or not\n"
-         << "  [--resize_src <true/false>] # do resize of the source image or not\n"
-         << "  [--width <int>] # resized image width\n"
-         << "  [--height <int>] # resized image height\n"
-         << "  [--hit_threshold <double>] # classifying plane distance threshold (0.0 usually)\n"
-         << "  [--scale <double>] # HOG window scale factor\n"
-         << "  [--nlevels <int>] # max number of HOG window scales\n"
-         << "  [--win_width <int>] # width of the window (48 or 64)\n"
-         << "  [--win_stride_width <int>] # distance by OX axis between neighbour wins\n"
-         << "  [--win_stride_height <int>] # distance by OY axis between neighbour wins\n"
-         << "  [--gr_threshold <int>] # merging similar rects constant\n"
-         << "  [--gamma_correct <int>] # do gamma correction or not\n"
-         << "  [--write_video <bool>] # write video or not\n"
-         << "  [--dst_video <path>] # output video path\n"
-         << "  [--dst_video_fps <double>] # output video fps\n";
-    help_showed = true;
-}
-
-int main(int argc, char** argv)
-{
-    try
-    {
-        if (argc < 2)
-            printHelp();
-        Args args = Args::read(argc, argv);
-        if (help_showed)
-            return -1;
-        App app(args);
-        app.run();
-    }
-    catch (const Exception& e) { return cout << "error: "  << e.what() << endl, 1; }
-    catch (const exception& e) { return cout << "error: "  << e.what() << endl, 1; }
-    catch(...) { return cout << "unknown exception" << endl, 1; }
-    return 0;
-}
-
-
-Args::Args()
-{
-    src_is_video = false;
-    src_is_camera = false;
-    camera_id = 0;
-
-    write_video = false;
-    dst_video_fps = 24.;
-
-    make_gray = false;
-
-    resize_src = false;
-    width = 640;
-    height = 480;
-
-    scale = 1.05;
-    nlevels = 13;
-    gr_threshold = 8;
-    hit_threshold = 1.4;
-    hit_threshold_auto = true;
-
-    win_width = 48;
-    win_stride_width = 8;
-    win_stride_height = 8;
-
-    gamma_corr = true;
-}
-
-
-Args Args::read(int argc, char** argv)
-{
-    Args args;
-    for (int i = 1; i < argc; i++)
-    {
-        if (string(argv[i]) == "--make_gray") args.make_gray = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--resize_src") args.resize_src = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--width") args.width = atoi(argv[++i]);
-        else if (string(argv[i]) == "--height") args.height = atoi(argv[++i]);
-        else if (string(argv[i]) == "--hit_threshold")
-        {
-            args.hit_threshold = atof(argv[++i]);
-            args.hit_threshold_auto = false;
-        }
-        else if (string(argv[i]) == "--scale") args.scale = atof(argv[++i]);
-        else if (string(argv[i]) == "--nlevels") args.nlevels = atoi(argv[++i]);
-        else if (string(argv[i]) == "--win_width") args.win_width = atoi(argv[++i]);
-        else if (string(argv[i]) == "--win_stride_width") args.win_stride_width = atoi(argv[++i]);
-        else if (string(argv[i]) == "--win_stride_height") args.win_stride_height = atoi(argv[++i]);
-        else if (string(argv[i]) == "--gr_threshold") args.gr_threshold = atoi(argv[++i]);
-        else if (string(argv[i]) == "--gamma_correct") args.gamma_corr = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--write_video") args.write_video = (string(argv[++i]) == "true");
-        else if (string(argv[i]) == "--dst_video") args.dst_video = argv[++i];
-        else if (string(argv[i]) == "--dst_video_fps") args.dst_video_fps = atof(argv[++i]);
-        else if (string(argv[i]) == "--help") printHelp();
-        else if (string(argv[i]) == "--video") { args.src = argv[++i]; args.src_is_video = true; }
-        else if (string(argv[i]) == "--camera") { args.camera_id = atoi(argv[++i]); args.src_is_camera = true; }
-        else if (args.src.empty()) args.src = argv[i];
-        else throw runtime_error((string("unknown key: ") + argv[i]));
-    }
-    return args;
-}
-
-
-App::App(const Args& s)
-{
-    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
-
-    args = s;
-    cout << "\nControls:\n"
-         << "\tESC - exit\n"
-         << "\tm - change mode GPU <-> CPU\n"
-         << "\tg - convert image to gray or not\n"
-         << "\t1/q - increase/decrease HOG scale\n"
-         << "\t2/w - increase/decrease levels count\n"
-         << "\t3/e - increase/decrease HOG group threshold\n"
-         << "\t4/r - increase/decrease hit threshold\n"
-         << endl;
-
-    use_gpu = true;
-    make_gray = args.make_gray;
-    scale = args.scale;
-    gr_threshold = args.gr_threshold;
-    nlevels = args.nlevels;
-
-    if (args.hit_threshold_auto)
-        args.hit_threshold = args.win_width == 48 ? 1.4 : 0.;
-    hit_threshold = args.hit_threshold;
-
-    gamma_corr = args.gamma_corr;
-
-    if (args.win_width != 64 && args.win_width != 48)
-        args.win_width = 64;
-
-    cout << "Scale: " << scale << endl;
-    if (args.resize_src)
-        cout << "Resized source: (" << args.width << ", " << args.height << ")\n";
-    cout << "Group threshold: " << gr_threshold << endl;
-    cout << "Levels number: " << nlevels << endl;
-    cout << "Win width: " << args.win_width << endl;
-    cout << "Win stride: (" << args.win_stride_width << ", " << args.win_stride_height << ")\n";
-    cout << "Hit threshold: " << hit_threshold << endl;
-    cout << "Gamma correction: " << gamma_corr << endl;
-    cout << endl;
-}
-
-
-void App::run()
-{
-    running = true;
-    cv::VideoWriter video_writer;
-
-    Size win_size(args.win_width, args.win_width * 2); //(64, 128) or (48, 96)
-    Size win_stride(args.win_stride_width, args.win_stride_height);
-
-    // Create HOG descriptors and detectors here
-    vector<float> detector;
-    if (win_size == Size(64, 128))
-        detector = cv::gpu::HOGDescriptor::getPeopleDetector64x128();
-    else
-        detector = cv::gpu::HOGDescriptor::getPeopleDetector48x96();
-
-    cv::gpu::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
-                                   cv::gpu::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
-                                   cv::gpu::HOGDescriptor::DEFAULT_NLEVELS);
-    cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9, 1, -1,
-                              HOGDescriptor::L2Hys, 0.2, gamma_corr, cv::HOGDescriptor::DEFAULT_NLEVELS);
-    gpu_hog.setSVMDetector(detector);
-    cpu_hog.setSVMDetector(detector);
-
-    while (running)
-    {
-        VideoCapture vc;
-        Mat frame;
-
-        if (args.src_is_video)
-        {
-            vc.open(args.src.c_str());
-            if (!vc.isOpened())
-                throw runtime_error(string("can't open video file: " + args.src));
-            vc >> frame;
-        }
-        else if (args.src_is_camera)
-        {
-            vc.open(args.camera_id);
-            if (!vc.isOpened())
-            {
-                stringstream msg;
-                msg << "can't open camera: " << args.camera_id;
-                throw runtime_error(msg.str());
-            }
-            vc >> frame;
-        }
-        else
-        {
-            frame = imread(args.src);
-            if (frame.empty())
-                throw runtime_error(string("can't open image file: " + args.src));
-        }
-
-        Mat img_aux, img, img_to_show;
-        gpu::GpuMat gpu_img;
-
-        // Iterate over all frames
-        while (running && !frame.empty())
-        {
-            workBegin();
-
-            // Change format of the image
-            if (make_gray) cvtColor(frame, img_aux, CV_BGR2GRAY);
-            else if (use_gpu) cvtColor(frame, img_aux, CV_BGR2BGRA);
-            else frame.copyTo(img_aux);
-
-            // Resize image
-            if (args.resize_src) resize(img_aux, img, Size(args.width, args.height));
-            else img = img_aux;
-            img_to_show = img;
-
-            gpu_hog.nlevels = nlevels;
-            cpu_hog.nlevels = nlevels;
-
-            vector<Rect> found;
-
-            // Perform HOG classification
-            hogWorkBegin();
-            if (use_gpu)
-            {
-                gpu_img.upload(img);
-                gpu_hog.detectMultiScale(gpu_img, found, hit_threshold, win_stride,
-                                         Size(0, 0), scale, gr_threshold);
-            }
-            else cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
-                                          Size(0, 0), scale, gr_threshold);
-            hogWorkEnd();
-
-            // Draw positive classified windows
-            for (size_t i = 0; i < found.size(); i++)
-            {
-                Rect r = found[i];
-                rectangle(img_to_show, r.tl(), r.br(), CV_RGB(0, 255, 0), 3);
-            }
-
-            if (use_gpu)
-                putText(img_to_show, "Mode: GPU", Point(5, 25), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
-            else
-                putText(img_to_show, "Mode: CPU", Point(5, 25), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
-            putText(img_to_show, "FPS (HOG only): " + hogWorkFps(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
-            putText(img_to_show, "FPS (total): " + workFps(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
-            imshow("opencv_gpu_hog", img_to_show);
-
-            if (args.src_is_video || args.src_is_camera) vc >> frame;
-
-            workEnd();
-
-            if (args.write_video)
-            {
-                if (!video_writer.isOpened())
-                {
-                    video_writer.open(args.dst_video, CV_FOURCC('x','v','i','d'), args.dst_video_fps,
-                                      img_to_show.size(), true);
-                    if (!video_writer.isOpened())
-                        throw std::runtime_error("can't create video writer");
-                }
-
-                if (make_gray) cvtColor(img_to_show, img, CV_GRAY2BGR);
-                else cvtColor(img_to_show, img, CV_BGRA2BGR);
-
-                video_writer << img;
-            }
-
-            handleKey((char)waitKey(3));
-        }
-    }
-}
-
-
-void App::handleKey(char key)
-{
-    switch (key)
-    {
-    case 27:
-        running = false;
-        break;
-    case 'm':
-    case 'M':
-        use_gpu = !use_gpu;
-        cout << "Switched to " << (use_gpu ? "CUDA" : "CPU") << " mode\n";
-        break;
-    case 'g':
-    case 'G':
-        make_gray = !make_gray;
-        cout << "Convert image to gray: " << (make_gray ? "YES" : "NO") << endl;
-        break;
-    case '1':
-        scale *= 1.05;
-        cout << "Scale: " << scale << endl;
-        break;
-    case 'q':
-    case 'Q':
-        scale /= 1.05;
-        cout << "Scale: " << scale << endl;
-        break;
-    case '2':
-        nlevels++;
-        cout << "Levels number: " << nlevels << endl;
-        break;
-    case 'w':
-    case 'W':
-        nlevels = max(nlevels - 1, 1);
-        cout << "Levels number: " << nlevels << endl;
-        break;
-    case '3':
-        gr_threshold++;
-        cout << "Group threshold: " << gr_threshold << endl;
-        break;
-    case 'e':
-    case 'E':
-        gr_threshold = max(0, gr_threshold - 1);
-        cout << "Group threshold: " << gr_threshold << endl;
-        break;
-    case '4':
-        hit_threshold+=0.25;
-        cout << "Hit threshold: " << hit_threshold << endl;
-        break;
-    case 'r':
-    case 'R':
-        hit_threshold = max(0.0, hit_threshold - 0.25);
-        cout << "Hit threshold: " << hit_threshold << endl;
-        break;
-    case 'c':
-    case 'C':
-        gamma_corr = !gamma_corr;
-        cout << "Gamma correction: " << gamma_corr << endl;
-        break;
-    }
-}
-
-
-inline void App::hogWorkBegin() { hog_work_begin = getTickCount(); }
-
-inline void App::hogWorkEnd()
-{
-    int64 delta = getTickCount() - hog_work_begin;
-    double freq = getTickFrequency();
-    hog_work_fps = freq / delta;
-}
-
-inline string App::hogWorkFps() const
-{
-    stringstream ss;
-    ss << hog_work_fps;
-    return ss.str();
-}
-
-
-inline void App::workBegin() { work_begin = getTickCount(); }
-
-inline void App::workEnd()
-{
-    int64 delta = getTickCount() - work_begin;
-    double freq = getTickFrequency();
-    work_fps = freq / delta;
-}
-
-inline string App::workFps() const
-{
-    stringstream ss;
-    ss << work_fps;
-    return ss.str();
-}
-
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <stdexcept>
+#include "opencv2/gpu/gpu.hpp"
+#include "opencv2/highgui/highgui.hpp"
+
+using namespace std;
+using namespace cv;
+
+bool help_showed = false;
+
+// Command-line configuration of the HOG sample. Defaults are set by the
+// constructor; read() overrides them from argv.
+class Args
+{
+public:
+    Args();
+    static Args read(int argc, char** argv);
+
+    string src;            // image path, or video path when given via --video
+    bool src_is_video;     // set by --video
+    bool src_is_camera;    // set by --camera
+    int camera_id;         // camera index given to --camera
+
+    bool write_video;      // --write_video: save the annotated frames
+    string dst_video;      // --dst_video: output video path
+    double dst_video_fps;  // --dst_video_fps: output video frame rate
+
+    bool make_gray;        // --make_gray: convert frames to grayscale
+
+    bool resize_src;       // --resize_src: resize frames to width x height
+    int width, height;     // --width / --height: resized image size
+
+    double scale;          // --scale: HOG window scale factor
+    int nlevels;           // --nlevels: max number of HOG window scales
+    int gr_threshold;      // --gr_threshold: merging similar rects constant
+
+    double hit_threshold;     // --hit_threshold: classifying plane distance threshold
+    bool hit_threshold_auto;  // stays true until --hit_threshold is given explicitly
+
+    int win_width;                            // --win_width: detection window width (48 or 64)
+    int win_stride_width, win_stride_height;  // --win_stride_*: distance between neighbouring windows
+
+    bool gamma_corr;       // --gamma_correct: do gamma correction or not
+};
+
+
+// Interactive HOG people-detection demo: reads frames from an image, video
+// or camera, runs the GPU or CPU detector on them and shows the result.
+class App
+{
+public:
+    App(const Args& s);
+    // Main loop; returns once `running` has been cleared (ESC key).
+    void run();
+
+    // Applies one keyboard command to the run-time settings below.
+    void handleKey(char key);
+
+    // Timing of the detection step only.
+    void hogWorkBegin();
+    void hogWorkEnd();
+    string hogWorkFps() const;
+
+    // Timing of a whole frame iteration.
+    void workBegin();
+    void workEnd();
+    string workFps() const;
+
+    // NOTE(review): no definition of message() is visible in this part of the file.
+    string message() const;
+
+private:
+    // Declared private; no definition is visible here - presumably to forbid assignment.
+    App operator=(App&);
+
+    Args args;
+    bool running;          // cleared by the ESC key to leave run()
+
+    // Run-time adjustable copies of the command-line settings.
+    bool use_gpu;
+    bool make_gray;
+    double scale;
+    int gr_threshold;
+    int nlevels;
+    double hit_threshold;
+    bool gamma_corr;
+
+    int64 hog_work_begin;  // tick count taken by hogWorkBegin()
+    double hog_work_fps;   // ticks-per-second / ticks spent in detection
+
+    int64 work_begin;      // tick count taken by workBegin()
+    double work_fps;       // ticks-per-second / ticks spent on the whole frame
+};
+
+// Prints the command-line usage text to stdout and records in the global
+// help_showed flag that it was shown, so that main() can exit afterwards.
+static void printHelp()
+{
+    // --gamma_correct and --write_video are parsed by comparing the value
+    // with "true", so they are documented as <true/false> like the other
+    // boolean switches.
+    cout << "Histogram of Oriented Gradients descriptor and detector sample.\n"
+         << "\nUsage: hog_gpu\n"
+         << "  (<image>|--video <video>|--camera <camera_id>) # frames source\n"
+         << "  [--make_gray <true/false>] # convert image to gray one or not\n"
+         << "  [--resize_src <true/false>] # do resize of the source image or not\n"
+         << "  [--width <int>] # resized image width\n"
+         << "  [--height <int>] # resized image height\n"
+         << "  [--hit_threshold <double>] # classifying plane distance threshold (0.0 usually)\n"
+         << "  [--scale <double>] # HOG window scale factor\n"
+         << "  [--nlevels <int>] # max number of HOG window scales\n"
+         << "  [--win_width <int>] # width of the window (48 or 64)\n"
+         << "  [--win_stride_width <int>] # distance by OX axis between neighbour wins\n"
+         << "  [--win_stride_height <int>] # distance by OY axis between neighbour wins\n"
+         << "  [--gr_threshold <int>] # merging similar rects constant\n"
+         << "  [--gamma_correct <true/false>] # do gamma correction or not\n"
+         << "  [--write_video <true/false>] # write video or not\n"
+         << "  [--dst_video <path>] # output video path\n"
+         << "  [--dst_video_fps <double>] # output video fps\n";
+    help_showed = true;
+}
+
+// Entry point of the HOG sample. Returns -1 after the help text was shown,
+// 1 when an exception was reported, 0 otherwise.
+int main(int argc, char** argv)
+{
+    try
+    {
+        // Without any argument only the usage text is shown.
+        if (argc < 2)
+        {
+            printHelp();
+        }
+
+        Args args = Args::read(argc, argv);
+        if (help_showed)
+        {
+            return -1;
+        }
+
+        App app(args);
+        app.run();
+    }
+    catch (const Exception& e)
+    {
+        cout << "error: "  << e.what() << endl;
+        return 1;
+    }
+    catch (const exception& e)
+    {
+        cout << "error: "  << e.what() << endl;
+        return 1;
+    }
+    catch(...)
+    {
+        cout << "unknown exception" << endl;
+        return 1;
+    }
+    return 0;
+}
+
+
+// Sets the defaults used when an option is not given on the command line.
+// The initializers follow the member declaration order; src and dst_video
+// start out empty.
+Args::Args()
+    : src_is_video(false),
+      src_is_camera(false),
+      camera_id(0),
+      write_video(false),
+      dst_video_fps(24.),
+      make_gray(false),
+      resize_src(false),
+      width(640),
+      height(480),
+      scale(1.05),
+      nlevels(13),
+      gr_threshold(8),
+      hit_threshold(1.4),
+      hit_threshold_auto(true),
+      win_width(48),
+      win_stride_width(8),
+      win_stride_height(8),
+      gamma_corr(true)
+{
+}
+
+
+// Returns the value that follows the option at argv[i] and advances i onto
+// it. Throws std::runtime_error when the option is the last token, instead
+// of reading past the end of argv.
+static const char* optionValue(int argc, char** argv, int& i)
+{
+    if (i + 1 >= argc)
+        throw runtime_error(string("missing value for key: ") + argv[i]);
+    return argv[++i];
+}
+
+
+// Builds an Args from the command line, starting from the defaults.
+// The first token that is not a known option is taken as the source path;
+// "--help" prints the usage text (which sets help_showed).
+// Throws std::runtime_error for an unknown key or an option without a value.
+Args Args::read(int argc, char** argv)
+{
+    Args args;
+    for (int i = 1; i < argc; i++)
+    {
+        const string key(argv[i]);
+
+        if (key == "--make_gray") args.make_gray = (string(optionValue(argc, argv, i)) == "true");
+        else if (key == "--resize_src") args.resize_src = (string(optionValue(argc, argv, i)) == "true");
+        else if (key == "--width") args.width = atoi(optionValue(argc, argv, i));
+        else if (key == "--height") args.height = atoi(optionValue(argc, argv, i));
+        else if (key == "--hit_threshold")
+        {
+            args.hit_threshold = atof(optionValue(argc, argv, i));
+            // An explicit value switches off the per-window-size default.
+            args.hit_threshold_auto = false;
+        }
+        else if (key == "--scale") args.scale = atof(optionValue(argc, argv, i));
+        else if (key == "--nlevels") args.nlevels = atoi(optionValue(argc, argv, i));
+        else if (key == "--win_width") args.win_width = atoi(optionValue(argc, argv, i));
+        else if (key == "--win_stride_width") args.win_stride_width = atoi(optionValue(argc, argv, i));
+        else if (key == "--win_stride_height") args.win_stride_height = atoi(optionValue(argc, argv, i));
+        else if (key == "--gr_threshold") args.gr_threshold = atoi(optionValue(argc, argv, i));
+        else if (key == "--gamma_correct") args.gamma_corr = (string(optionValue(argc, argv, i)) == "true");
+        else if (key == "--write_video") args.write_video = (string(optionValue(argc, argv, i)) == "true");
+        else if (key == "--dst_video") args.dst_video = optionValue(argc, argv, i);
+        else if (key == "--dst_video_fps") args.dst_video_fps = atof(optionValue(argc, argv, i));
+        else if (key == "--help") printHelp();
+        else if (key == "--video") { args.src = optionValue(argc, argv, i); args.src_is_video = true; }
+        else if (key == "--camera") { args.camera_id = atoi(optionValue(argc, argv, i)); args.src_is_camera = true; }
+        else if (args.src.empty()) args.src = argv[i];
+        else throw runtime_error((string("unknown key: ") + argv[i]));
+    }
+    return args;
+}
+
+
+// Prints the CUDA device summary and the keyboard controls, copies the
+// command-line settings into the run-time adjustable members and echoes
+// the effective configuration to stdout.
+App::App(const Args& s)
+{
+    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
+
+    args = s;
+    cout << "\nControls:\n"
+         << "\tESC - exit\n"
+         << "\tm - change mode GPU <-> CPU\n"
+         << "\tg - convert image to gray or not\n"
+         << "\tc - switch gamma correction on/off\n"
+         << "\t1/q - increase/decrease HOG scale\n"
+         << "\t2/w - increase/decrease levels count\n"
+         << "\t3/e - increase/decrease HOG group threshold\n"
+         << "\t4/r - increase/decrease hit threshold\n"
+         << endl;
+
+    running = false;
+    use_gpu = true;
+    make_gray = args.make_gray;
+    scale = args.scale;
+    gr_threshold = args.gr_threshold;
+    nlevels = args.nlevels;
+
+    // Without an explicit --hit_threshold the value depends on the window width.
+    if (args.hit_threshold_auto)
+        args.hit_threshold = args.win_width == 48 ? 1.4 : 0.;
+    hit_threshold = args.hit_threshold;
+
+    gamma_corr = args.gamma_corr;
+
+    // Only 48 and 64 are supported; anything else falls back to 64.
+    if (args.win_width != 64 && args.win_width != 48)
+        args.win_width = 64;
+
+    // run() prints workFps() on the first frame before workEnd() has ever
+    // been called, so the timing state must start from defined values.
+    hog_work_begin = 0;
+    hog_work_fps = 0.;
+    work_begin = 0;
+    work_fps = 0.;
+
+    cout << "Scale: " << scale << endl;
+    if (args.resize_src)
+        cout << "Resized source: (" << args.width << ", " << args.height << ")\n";
+    cout << "Group threshold: " << gr_threshold << endl;
+    cout << "Levels number: " << nlevels << endl;
+    cout << "Win width: " << args.win_width << endl;
+    cout << "Win stride: (" << args.win_stride_width << ", " << args.win_stride_height << ")\n";
+    cout << "Hit threshold: " << hit_threshold << endl;
+    cout << "Gamma correction: " << gamma_corr << endl;
+    cout << endl;
+}
+
+
+// Main loop of the demo. Builds the GPU and CPU HOG detectors once, then
+// repeatedly (re)opens the configured source and, for every frame, converts
+// and optionally resizes it, runs the detector selected by use_gpu, draws the
+// detections and the FPS overlay, shows the frame, optionally appends it to
+// the output video and handles one key press. Leaves when `running` is
+// cleared. Throws std::runtime_error when the source or the video writer
+// cannot be opened.
+void App::run()
+{
+    running = true;
+    cv::VideoWriter video_writer;
+
+    Size win_size(args.win_width, args.win_width * 2); //(64, 128) or (48, 96)
+    Size win_stride(args.win_stride_width, args.win_stride_height);
+
+    // Create HOG descriptors and detectors here
+    vector<float> detector;
+    if (win_size == Size(64, 128))
+        detector = cv::gpu::HOGDescriptor::getPeopleDetector64x128();
+    else
+        detector = cv::gpu::HOGDescriptor::getPeopleDetector48x96();
+
+    cv::gpu::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
+                                   cv::gpu::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
+                                   cv::gpu::HOGDescriptor::DEFAULT_NLEVELS);
+    cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9, 1, -1,
+                              HOGDescriptor::L2Hys, 0.2, gamma_corr, cv::HOGDescriptor::DEFAULT_NLEVELS);
+    gpu_hog.setSVMDetector(detector);
+    cpu_hog.setSVMDetector(detector);
+
+    while (running)
+    {
+        VideoCapture vc;
+        Mat frame;
+
+        if (args.src_is_video)
+        {
+            vc.open(args.src.c_str());
+            if (!vc.isOpened())
+                throw runtime_error(string("can't open video file: " + args.src));
+            vc >> frame;
+        }
+        else if (args.src_is_camera)
+        {
+            vc.open(args.camera_id);
+            if (!vc.isOpened())
+            {
+                stringstream msg;
+                msg << "can't open camera: " << args.camera_id;
+                throw runtime_error(msg.str());
+            }
+            vc >> frame;
+        }
+        else
+        {
+            frame = imread(args.src);
+            if (frame.empty())
+                throw runtime_error(string("can't open image file: " + args.src));
+        }
+
+        Mat img_aux, img, img_to_show;
+        gpu::GpuMat gpu_img;
+
+        // Iterate over all frames
+        while (running && !frame.empty())
+        {
+            workBegin();
+
+            // Change format of the image: gray (1 channel), BGRA (4 channels)
+            // for the GPU path, or the unchanged source frame for the CPU path.
+            if (make_gray) cvtColor(frame, img_aux, CV_BGR2GRAY);
+            else if (use_gpu) cvtColor(frame, img_aux, CV_BGR2BGRA);
+            else frame.copyTo(img_aux);
+
+            // Resize image
+            if (args.resize_src) resize(img_aux, img, Size(args.width, args.height));
+            else img = img_aux;
+            img_to_show = img;
+
+            // Push the settings that can be changed from the keyboard into
+            // both detectors; without this the gamma-correction key would
+            // only flip a flag that nothing reads after construction.
+            gpu_hog.nlevels = nlevels;
+            cpu_hog.nlevels = nlevels;
+            gpu_hog.gamma_correction = gamma_corr;
+            cpu_hog.gammaCorrection = gamma_corr;
+
+            vector<Rect> found;
+
+            // Perform HOG classification
+            hogWorkBegin();
+            if (use_gpu)
+            {
+                gpu_img.upload(img);
+                gpu_hog.detectMultiScale(gpu_img, found, hit_threshold, win_stride,
+                                         Size(0, 0), scale, gr_threshold);
+            }
+            else cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
+                                          Size(0, 0), scale, gr_threshold);
+            hogWorkEnd();
+
+            // Draw positive classified windows
+            for (size_t i = 0; i < found.size(); i++)
+            {
+                Rect r = found[i];
+                rectangle(img_to_show, r.tl(), r.br(), CV_RGB(0, 255, 0), 3);
+            }
+
+            if (use_gpu)
+                putText(img_to_show, "Mode: GPU", Point(5, 25), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
+            else
+                putText(img_to_show, "Mode: CPU", Point(5, 25), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
+            putText(img_to_show, "FPS (HOG only): " + hogWorkFps(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
+            putText(img_to_show, "FPS (total): " + workFps(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
+            imshow("opencv_gpu_hog", img_to_show);
+
+            if (args.src_is_video || args.src_is_camera) vc >> frame;
+
+            workEnd();
+
+            if (args.write_video)
+            {
+                if (!video_writer.isOpened())
+                {
+                    video_writer.open(args.dst_video, CV_FOURCC('x','v','i','d'), args.dst_video_fps,
+                                      img_to_show.size(), true);
+                    if (!video_writer.isOpened())
+                        throw std::runtime_error("can't create video writer");
+                }
+
+                // The writer was opened for colour frames. Pick the conversion
+                // from the actual channel count: in CPU colour mode the frame
+                // is already 3-channel BGR, and BGRA2BGR would reject it.
+                Mat out_frame;
+                switch (img_to_show.channels())
+                {
+                case 1:
+                    cvtColor(img_to_show, out_frame, CV_GRAY2BGR);
+                    break;
+                case 4:
+                    cvtColor(img_to_show, out_frame, CV_BGRA2BGR);
+                    break;
+                default:
+                    out_frame = img_to_show;
+                    break;
+                }
+
+                video_writer << out_frame;
+            }
+
+            handleKey((char)waitKey(3));
+        }
+    }
+}
+
+
+// Applies one keyboard command. Letter keys are accepted in either case;
+// every recognised key except ESC echoes the new setting to stdout, and
+// unknown keys are ignored.
+void App::handleKey(char key)
+{
+    if (key == 27)
+    {
+        // ESC: leave the main loop
+        running = false;
+    }
+    else if (key == 'm' || key == 'M')
+    {
+        use_gpu = !use_gpu;
+        cout << "Switched to " << (use_gpu ? "CUDA" : "CPU") << " mode\n";
+    }
+    else if (key == 'g' || key == 'G')
+    {
+        make_gray = !make_gray;
+        cout << "Convert image to gray: " << (make_gray ? "YES" : "NO") << endl;
+    }
+    else if (key == '1')
+    {
+        scale *= 1.05;
+        cout << "Scale: " << scale << endl;
+    }
+    else if (key == 'q' || key == 'Q')
+    {
+        scale /= 1.05;
+        cout << "Scale: " << scale << endl;
+    }
+    else if (key == '2')
+    {
+        nlevels++;
+        cout << "Levels number: " << nlevels << endl;
+    }
+    else if (key == 'w' || key == 'W')
+    {
+        // never go below a single level
+        nlevels = max(nlevels - 1, 1);
+        cout << "Levels number: " << nlevels << endl;
+    }
+    else if (key == '3')
+    {
+        gr_threshold++;
+        cout << "Group threshold: " << gr_threshold << endl;
+    }
+    else if (key == 'e' || key == 'E')
+    {
+        gr_threshold = max(0, gr_threshold - 1);
+        cout << "Group threshold: " << gr_threshold << endl;
+    }
+    else if (key == '4')
+    {
+        hit_threshold += 0.25;
+        cout << "Hit threshold: " << hit_threshold << endl;
+    }
+    else if (key == 'r' || key == 'R')
+    {
+        hit_threshold = max(0.0, hit_threshold - 0.25);
+        cout << "Hit threshold: " << hit_threshold << endl;
+    }
+    else if (key == 'c' || key == 'C')
+    {
+        gamma_corr = !gamma_corr;
+        cout << "Gamma correction: " << gamma_corr << endl;
+    }
+}
+
+
+// Remembers the tick count at which the detection step starts.
+inline void App::hogWorkBegin()
+{
+    hog_work_begin = getTickCount();
+}
+
+// Converts the ticks elapsed since hogWorkBegin() into a rate
+// (tick frequency divided by elapsed ticks) and stores it in hog_work_fps.
+inline void App::hogWorkEnd()
+{
+    const int64 elapsed_ticks = getTickCount() - hog_work_begin;
+    hog_work_fps = getTickFrequency() / elapsed_ticks;
+}
+
+// Returns hog_work_fps formatted with the default stream formatting.
+inline string App::hogWorkFps() const
+{
+    stringstream text;
+    text << hog_work_fps;
+    const string formatted = text.str();
+    return formatted;
+}
+
+
+// Remembers the tick count at which processing of a frame starts.
+inline void App::workBegin()
+{
+    work_begin = getTickCount();
+}
+
+// Converts the ticks elapsed since workBegin() into a rate
+// (tick frequency divided by elapsed ticks) and stores it in work_fps.
+inline void App::workEnd()
+{
+    const int64 elapsed_ticks = getTickCount() - work_begin;
+    work_fps = getTickFrequency() / elapsed_ticks;
+}
+
+// Returns work_fps formatted with the default stream formatting.
+inline string App::workFps() const
+{
+    stringstream text;
+    text << work_fps;
+    const string formatted = text.str();
+    return formatted;
+}
+
--- a/samples/gpu/morfology.cpp
+++ b/samples/gpu/morfology.cpp
@@ -1,119 +1,119 @@
-
-#include "opencv2/imgproc/imgproc.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/gpu/gpu.hpp"
-#include <stdlib.h>
-#include <stdio.h>
-
-using namespace cv;
-using namespace cv::gpu;
-
-static void help()
-{
-
-printf("\nShow off image morphology: erosion, dialation, open and close\n"
-    "Call:\n   morphology2 [image]\n"
-    "This program also shows use of rect, elipse and cross kernels\n\n");
-printf( "Hot keys: \n"
-    "\tESC - quit the program\n"
-    "\tr - use rectangle structuring element\n"
-    "\te - use elliptic structuring element\n"
-    "\tc - use cross-shaped structuring element\n"
-    "\tSPACE - loop through all the options\n" );
-}
-
-GpuMat src, dst;
-
-int element_shape = MORPH_RECT;
-
-//the address of variable which receives trackbar position update
-int max_iters = 10;
-int open_close_pos = 0;
-int erode_dilate_pos = 0;
-
-// callback function for open/close trackbar
-static void OpenClose(int, void*)
-{
-    int n = open_close_pos - max_iters;
-    int an = n > 0 ? n : -n;
-    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an) );
-    if( n < 0 )
-        cv::gpu::morphologyEx(src, dst, CV_MOP_OPEN, element);
-    else
-        cv::gpu::morphologyEx(src, dst, CV_MOP_CLOSE, element);
-    imshow("Open/Close",(Mat)dst);
-}
-
-// callback function for erode/dilate trackbar
-static void ErodeDilate(int, void*)
-{
-    int n = erode_dilate_pos - max_iters;
-    int an = n > 0 ? n : -n;
-    Mat element = getStructuringElement(element_shape, Size(an*2+1, an*2+1), Point(an, an) );
-    if( n < 0 )
-        cv::gpu::erode(src, dst, element);
-    else
-        cv::gpu::dilate(src, dst, element);
-    imshow("Erode/Dilate",(Mat)dst);
-}
-
-
-int main( int argc, char** argv )
-{
-    char* filename = argc == 2 ? argv[1] : (char*)"baboon.jpg";
-    if (string(argv[1]) == "--help")
-    {
-        help();
-        return -1;
-    }
-
-    src.upload(imread(filename, 1));
-    if (src.empty())
-    {
-        help();
-        return -1;
-    }
-
-    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
-
-    help();
-
-
-    if (src.channels() == 3)
-    {
-        // gpu support only 4th channel images
-        GpuMat src4ch;
-        cv::gpu::cvtColor(src, src4ch, CV_BGR2BGRA);
-        src = src4ch;
-    }
-
-    //create windows for output images
-    namedWindow("Open/Close",1);
-    namedWindow("Erode/Dilate",1);
-
-    open_close_pos = erode_dilate_pos = max_iters;
-    createTrackbar("iterations", "Open/Close",&open_close_pos,max_iters*2+1,OpenClose);
-    createTrackbar("iterations", "Erode/Dilate",&erode_dilate_pos,max_iters*2+1,ErodeDilate);
-
-    for(;;)
-    {
-        int c;
-
-        OpenClose(open_close_pos, 0);
-        ErodeDilate(erode_dilate_pos, 0);
-        c = cvWaitKey(0);
-
-        if( (char)c == 27 )
-            break;
-        if( (char)c == 'e' )
-            element_shape = MORPH_ELLIPSE;
-        else if( (char)c == 'r' )
-            element_shape = MORPH_RECT;
-        else if( (char)c == 'c' )
-            element_shape = MORPH_CROSS;
-        else if( (char)c == ' ' )
-            element_shape = (element_shape + 1) % 3;
-    }
-
-    return 0;
-}
+
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/gpu/gpu.hpp"
+#include <stdlib.h>
+#include <stdio.h>
+
+using namespace cv;
+using namespace cv::gpu;
+
+// Prints the sample description, its command line and the hot keys to stdout.
+static void help()
+{
+    // One call with the two original messages joined; the emitted bytes are unchanged.
+    printf("\nShow off image morphology: erosion, dialation, open and close\n"
+           "Call:\n   morphology2 [image]\n"
+           "This program also shows use of rect, elipse and cross kernels\n\n"
+           "Hot keys: \n"
+           "\tESC - quit the program\n"
+           "\tr - use rectangle structuring element\n"
+           "\te - use elliptic structuring element\n"
+           "\tc - use cross-shaped structuring element\n"
+           "\tSPACE - loop through all the options\n");
+}
+
+GpuMat src, dst;
+
+int element_shape = MORPH_RECT;
+
+// Trackbar state: the addresses of the two position variables below are passed to createTrackbar(), which updates them
+int max_iters = 10;
+int open_close_pos = 0;
+int erode_dilate_pos = 0;
+
+// Trackbar callback for the "Open/Close" window.
+// Positions below max_iters apply an opening, all others a closing; the
+// distance from max_iters is used as the half-size of the structuring element.
+static void OpenClose(int, void*)
+{
+    const int offset = open_close_pos - max_iters;
+    const int radius = offset < 0 ? -offset : offset;
+    const int side = radius * 2 + 1;
+    const int operation = offset < 0 ? CV_MOP_OPEN : CV_MOP_CLOSE;
+
+    Mat kernel = getStructuringElement(element_shape, Size(side, side), Point(radius, radius));
+    cv::gpu::morphologyEx(src, dst, operation, kernel);
+
+    imshow("Open/Close", (Mat)dst);
+}
+
+// Trackbar callback for the "Erode/Dilate" window.
+// Positions below max_iters erode, all others dilate; the distance from
+// max_iters is used as the half-size of the structuring element.
+static void ErodeDilate(int, void*)
+{
+    const int offset = erode_dilate_pos - max_iters;
+    const int radius = offset < 0 ? -offset : offset;
+    const int side = radius * 2 + 1;
+
+    Mat kernel = getStructuringElement(element_shape, Size(side, side), Point(radius, radius));
+    if( offset >= 0 )
+        cv::gpu::dilate(src, dst, kernel);
+    else
+        cv::gpu::erode(src, dst, kernel);
+
+    imshow("Erode/Dilate", (Mat)dst);
+}
+
+
+// Entry point of the GPU morphology sample.
+// Loads the image named by the single command line argument (default
+// "baboon.jpg"), uploads it to the GPU and runs the interactive key loop.
+// Returns -1 after printing the help text when "--help" is given or the image
+// cannot be read, and 0 when the user leaves the loop with ESC.
+int main( int argc, char** argv )
+{
+    // argv[1] is a null pointer when the program is started without arguments,
+    // so it may only be turned into a string when an argument is present.
+    if (argc >= 2 && string(argv[1]) == "--help")
+    {
+        help();
+        return -1;
+    }
+    const char* filename = argc == 2 ? argv[1] : "baboon.jpg";
+
+    // Validate the host image first so that an unreadable file is reported
+    // through help() and an empty matrix is never handed to upload().
+    Mat image = imread(filename, 1);
+    if (image.empty())
+    {
+        help();
+        return -1;
+    }
+    src.upload(image);
+
+    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
+
+    help();
+
+
+    if (src.channels() == 3)
+    {
+        // the gpu routines used here support only 4-channel images,
+        // so a BGR input is expanded to BGRA
+        GpuMat src4ch;
+        cv::gpu::cvtColor(src, src4ch, CV_BGR2BGRA);
+        src = src4ch;
+    }
+
+    //create windows for output images
+    namedWindow("Open/Close",1);
+    namedWindow("Erode/Dilate",1);
+
+    // Both trackbars start at max_iters, the position the callbacks measure from
+    open_close_pos = erode_dilate_pos = max_iters;
+    createTrackbar("iterations", "Open/Close",&open_close_pos,max_iters*2+1,OpenClose);
+    createTrackbar("iterations", "Erode/Dilate",&erode_dilate_pos,max_iters*2+1,ErodeDilate);
+
+    for(;;)
+    {
+        int c;
+
+        // Redraw both windows with the current shape and trackbar positions
+        OpenClose(open_close_pos, 0);
+        ErodeDilate(erode_dilate_pos, 0);
+        c = cvWaitKey(0);
+
+        if( (char)c == 27 ) // ESC
+            break;
+        if( (char)c == 'e' )
+            element_shape = MORPH_ELLIPSE;
+        else if( (char)c == 'r' )
+            element_shape = MORPH_RECT;
+        else if( (char)c == 'c' )
+            element_shape = MORPH_CROSS;
+        else if( (char)c == ' ' )
+            element_shape = (element_shape + 1) % 3; // cycle through the three shapes
+    }
+
+    return 0;
+}
--- a/samples/gpu/multi.cpp
+++ b/samples/gpu/multi.cpp
@@ -1,98 +1,98 @@
-/* This sample demonstrates the way you can perform independed tasks 
-   on the different GPUs */
-
-// Disable some warnings which are caused with CUDA headers
-#if defined(_MSC_VER)
-#pragma warning(disable: 4201 4408 4100)
-#endif
-
-#include <iostream>
-#include "cvconfig.h"
-#include "opencv2/core/core.hpp"
-#include "opencv2/gpu/gpu.hpp"
-
-#if !defined(HAVE_CUDA) || !defined(HAVE_TBB)
-
-int main()
-{
-#if !defined(HAVE_CUDA)
-    std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true).\n";
-#endif
-
-#if !defined(HAVE_TBB)
-    std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
-#endif
-
-    return 0;
-}
-
-#else
-
-#include "opencv2/core/internal.hpp" // For TBB wrappers
-
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-
-struct Worker { void operator()(int device_id) const; };
-
-int main()
-{
-    int num_devices = getCudaEnabledDeviceCount();
-    if (num_devices < 2)
-    {
-        std::cout << "Two or more GPUs are required\n";
-        return -1;
-    }
-    for (int i = 0; i < num_devices; ++i)
-    {
-        cv::gpu::printShortCudaDeviceInfo(i);
-
-        DeviceInfo dev_info(i);
-        if (!dev_info.isCompatible())
-        {
-            std::cout << "GPU module isn't built for GPU #" << i << " ("
-                 << dev_info.name() << ", CC " << dev_info.majorVersion()
-                 << dev_info.minorVersion() << "\n";
-            return -1;
-        }
-    }
-
-    // Execute calculation in two threads using two GPUs
-    int devices[] = {0, 1};
-    parallel_do(devices, devices + 2, Worker());
-
-    return 0;
-}
-
-
-void Worker::operator()(int device_id) const
-{
-    setDevice(device_id);
-
-    Mat src(1000, 1000, CV_32F);
-    Mat dst;
-
-    RNG rng(0);
-    rng.fill(src, RNG::UNIFORM, 0, 1);
-
-    // CPU works
-    transpose(src, dst);
-
-    // GPU works
-    GpuMat d_src(src);
-    GpuMat d_dst;
-    transpose(d_src, d_dst);
-
-    // Check results
-    bool passed = norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
-    std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): "
-        << (passed ? "passed" : "FAILED") << endl;
-
-    // Deallocate data here, otherwise deallocation will be performed
-    // after context is extracted from the stack
-    d_src.release();
-    d_dst.release();
-}
-
-#endif
+/* This sample demonstrates how to perform independent tasks
+   on different GPUs */
+
+// Disable some warnings which are caused with CUDA headers
+#if defined(_MSC_VER)
+#pragma warning(disable: 4201 4408 4100)
+#endif
+
+#include <iostream>
+#include "cvconfig.h"
+#include "opencv2/core/core.hpp"
+#include "opencv2/gpu/gpu.hpp"
+
+#if !defined(HAVE_CUDA) || !defined(HAVE_TBB)
+
+// Fallback entry point compiled when CUDA or TBB support is missing:
+// prints which prerequisite is not available and returns 0.
+int main()
+{
+#ifndef HAVE_CUDA
+    std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true).\n";
+#endif
+
+#ifndef HAVE_TBB
+    std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
+#endif
+
+    return 0;
+}
+
+#else
+
+#include "opencv2/core/internal.hpp" // For TBB wrappers
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+// Functor executed once per device id by parallel_do() in main()
+struct Worker { void operator()(int device_id) const; };
+
+// Entry point of the multi-GPU sample.
+// Requires at least two CUDA devices and a GPU module built for every
+// detected device, then runs Worker on devices 0 and 1 in parallel.
+// Returns -1 when a requirement is not met, 0 otherwise.
+int main()
+{
+    int num_devices = getCudaEnabledDeviceCount();
+    if (num_devices < 2)
+    {
+        std::cout << "Two or more GPUs are required\n";
+        return -1;
+    }
+    for (int i = 0; i < num_devices; ++i)
+    {
+        cv::gpu::printShortCudaDeviceInfo(i);
+
+        DeviceInfo dev_info(i);
+        if (!dev_info.isCompatible())
+        {
+            // Compute capability is printed as "major.minor" and the
+            // parenthesis opened after the device number is closed again.
+            std::cout << "GPU module isn't built for GPU #" << i << " ("
+                 << dev_info.name() << ", CC " << dev_info.majorVersion()
+                 << '.' << dev_info.minorVersion() << ")\n";
+            return -1;
+        }
+    }
+
+    // Execute calculation in two threads using two GPUs
+    int devices[] = {0, 1};
+    parallel_do(devices, devices + 2, Worker());
+
+    return 0;
+}
+
+
+// Transposes the same random matrix on the CPU and on GPU #device_id and
+// prints whether the two results agree.
+void Worker::operator()(int device_id) const
+{
+    setDevice(device_id);
+
+    // Input: 1000x1000 float matrix filled from a fixed-seed generator
+    Mat host_src(1000, 1000, CV_32F);
+    Mat host_dst;
+
+    RNG rng(0);
+    rng.fill(host_src, RNG::UNIFORM, 0, 1);
+
+    // Reference result on the CPU
+    transpose(host_src, host_dst);
+
+    // Same operation on the GPU
+    GpuMat d_src(host_src);
+    GpuMat d_dst;
+    transpose(d_src, d_dst);
+
+    // The results match when the largest absolute difference is below 1e-3
+    const double max_abs_diff = norm(host_dst - Mat(d_dst), NORM_INF);
+    const bool passed = max_abs_diff < 1e-3;
+    std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): "
+        << (passed ? "passed" : "FAILED") << endl;
+
+    // Release the device buffers explicitly: otherwise deallocation would be
+    // performed after the context is extracted from the stack
+    d_src.release();
+    d_dst.release();
+}
+
+#endif
--- a/samples/gpu/opticalflow_nvidia_api.cpp
+++ b/samples/gpu/opticalflow_nvidia_api.cpp
--- a/samples/gpu/performance/CMakeLists.txt
+++ b/samples/gpu/performance/CMakeLists.txt
@@ -1,26 +1,26 @@
-set(the_target "example_gpu_performance")
-
-file(GLOB sources "performance/*.cpp")
-file(GLOB headers "performance/*.h")
-
-add_executable(${the_target} ${sources} ${headers})
-target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${OPENCV_GPU_SAMPLES_REQUIRED_DEPS})
-
-set_target_properties(${the_target} PROPERTIES
-    OUTPUT_NAME "performance_gpu"
-    PROJECT_LABEL "(EXAMPLE_GPU) performance")
-
-if(ENABLE_SOLUTION_FOLDERS)
-  set_target_properties(${the_target} PROPERTIES FOLDER "samples//gpu")
-endif()
-
-if(WIN32)
-  install(TARGETS ${the_target} RUNTIME DESTINATION "samples/gpu" COMPONENT main)
-endif()
-
-if(INSTALL_C_EXAMPLES AND NOT WIN32)
-  file(GLOB GPU_FILES performance/*.cpp performance/*.h)
-  install(FILES ${GPU_FILES}
-          DESTINATION share/OpenCV/samples/gpu/performance
-          PERMISSIONS OWNER_READ GROUP_READ WORLD_READ)
-endif()
+# Build script for the GPU performance sample executable.
+set(the_target "example_gpu_performance")
+
+# Collect every source and header matching performance/*.
+# NOTE(review): the patterns are relative, so this script is presumably
+# processed from samples/gpu rather than from this directory - confirm.
+file(GLOB sources "performance/*.cpp")
+file(GLOB headers "performance/*.h")
+
+# Link against the same libraries as the other GPU samples
+add_executable(${the_target} ${sources} ${headers})
+target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${OPENCV_GPU_SAMPLES_REQUIRED_DEPS})
+
+# The binary is named performance_gpu; PROJECT_LABEL is the name shown in IDEs
+set_target_properties(${the_target} PROPERTIES
+    OUTPUT_NAME "performance_gpu"
+    PROJECT_LABEL "(EXAMPLE_GPU) performance")
+
+if(ENABLE_SOLUTION_FOLDERS)
+  set_target_properties(${the_target} PROPERTIES FOLDER "samples//gpu")
+endif()
+
+# On Windows the built executable is installed
+if(WIN32)
+  install(TARGETS ${the_target} RUNTIME DESTINATION "samples/gpu" COMPONENT main)
+endif()
+
+# Elsewhere the sample sources are installed read-only when INSTALL_C_EXAMPLES is set
+if(INSTALL_C_EXAMPLES AND NOT WIN32)
+  file(GLOB GPU_FILES performance/*.cpp performance/*.h)
+  install(FILES ${GPU_FILES}
+          DESTINATION share/OpenCV/samples/gpu/performance
+          PERMISSIONS OWNER_READ GROUP_READ WORLD_READ)
+endif()
--- a/samples/gpu/performance/performance.cpp
+++ b/samples/gpu/performance/performance.cpp
@@ -1,226 +1,226 @@
-#include <iomanip>
-#include <stdexcept>
-#include <string>
-#include "performance.h"
-
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-
-void TestSystem::run()
-{
-    if (is_list_mode_)
-    {
-        for (vector<Runnable*>::iterator it = tests_.begin(); it != tests_.end(); ++it)
-            cout << (*it)->name() << endl;
-
-        return;
-    }
-
-    // Run test initializers
-    for (vector<Runnable*>::iterator it = inits_.begin(); it != inits_.end(); ++it)
-    {
-        if ((*it)->name().find(test_filter_, 0) != string::npos)
-            (*it)->run();
-    }
-
-    printHeading();
-
-    // Run tests
-    for (vector<Runnable*>::iterator it = tests_.begin(); it != tests_.end(); ++it)
-    {
-        try
-        {
-            if ((*it)->name().find(test_filter_, 0) != string::npos)
-            {
-                cout << endl << (*it)->name() << ":\n";
-                (*it)->run();
-                finishCurrentSubtest();
-            }
-        }
-        catch (const Exception&)
-        {
-            // Message is printed via callback
-            resetCurrentSubtest();
-        }
-        catch (const runtime_error& e)
-        {
-            printError(e.what());
-            resetCurrentSubtest();
-        }
-    }
-
-    printSummary();
-}
-
-
-void TestSystem::finishCurrentSubtest()
-{
-    if (cur_subtest_is_empty_)
-        // There is no need to print subtest statistics
-        return;
-
-    double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;
-    double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
-
-    double speedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_elapsed_);
-    speedup_total_ += speedup;
-
-    printMetrics(cpu_time, gpu_time, speedup);
-
-    num_subtests_called_++;
-    resetCurrentSubtest();
-}
-
-
-double TestSystem::meanTime(const vector<int64> &samples)
-{
-    double sum = accumulate(samples.begin(), samples.end(), 0.);
-    if (samples.size() > 1)
-        return (sum - samples[0]) / (samples.size() - 1);
-    return sum;
-}
-
-
-void TestSystem::printHeading()
-{
-    cout << endl;
-    cout << setiosflags(ios_base::left);
-    cout << TAB << setw(10) << "CPU, ms" << setw(10) << "GPU, ms"
-        << setw(14) << "SPEEDUP"
-        << "DESCRIPTION\n";
-    cout << resetiosflags(ios_base::left);
-}
-
-
-void TestSystem::printSummary()
-{
-    cout << setiosflags(ios_base::fixed);
-    cout << "\naverage GPU speedup: x"
-        << setprecision(3) << speedup_total_ / std::max(1, num_subtests_called_)
-        << endl;
-    cout << resetiosflags(ios_base::fixed);
-}
-
-
-void TestSystem::printMetrics(double cpu_time, double gpu_time, double speedup)
-{
-    cout << TAB << setiosflags(ios_base::left);
-    stringstream stream;
-
-    stream << cpu_time;
-    cout << setw(10) << stream.str();
-
-    stream.str("");
-    stream << gpu_time;
-    cout << setw(10) << stream.str();
-
-    stream.str("");
-    stream << "x" << setprecision(3) << speedup;
-    cout << setw(14) << stream.str();
-
-    cout << cur_subtest_description_.str();
-    cout << resetiosflags(ios_base::left) << endl;
-}
-
-
-void TestSystem::printError(const std::string& msg)
-{
-    cout << TAB << "[error: " << msg << "] " << cur_subtest_description_.str() << endl;
-}
-
-
-void gen(Mat& mat, int rows, int cols, int type, Scalar low, Scalar high)
-{
-    mat.create(rows, cols, type);
-    RNG rng(0);
-    rng.fill(mat, RNG::UNIFORM, low, high);
-}
-
-
-string abspath(const string& relpath)
-{
-    return TestSystem::instance().workingDir() + relpath;
-}
-
-
-static int CV_CDECL cvErrorCallback(int /*status*/, const char* /*func_name*/,
-                             const char* err_msg, const char* /*file_name*/,
-                             int /*line*/, void* /*userdata*/)
-{
-    TestSystem::instance().printError(err_msg);
-    return 0;
-}
-
-
-int main(int argc, const char* argv[])
-{
-    int num_devices = getCudaEnabledDeviceCount();
-    if (num_devices == 0)
-    {
-        cerr << "No GPU found or the library was compiled without GPU support";
-        return -1;
-    }
-
-    redirectError(cvErrorCallback);
-
-    const char* keys =
-       "{ h  help    |       | print help message }"
-       "{ f  filter  |       | filter for test }"
-       "{ w  workdir |       | set working directory }"
-       "{ l  list    |       | show all tests }"
-       "{ d  device  | 0     | device id }"
-       "{ i  iters   | 10    | iteration count }";
-
-    CommandLineParser cmd(argc, argv, keys);
-
-    if (cmd.has("help") || !cmd.check())
-    {
-        cmd.printMessage();
-        cmd.printErrors();
-        return 0;
-    }
-
-
-    int device = cmd.get<int>("device");
-    if (device < 0 || device >= num_devices)
-    {
-        cerr << "Invalid device ID" << endl;
-        return -1;
-    }
-    DeviceInfo dev_info(device);
-    if (!dev_info.isCompatible())
-    {
-        cerr << "GPU module isn't built for GPU #" << device << " " << dev_info.name() << ", CC " << dev_info.majorVersion() << '.' << dev_info.minorVersion() << endl;
-        return -1;
-    }
-    setDevice(device);
-    printShortCudaDeviceInfo(device);
-
-    string filter = cmd.get<string>("filter");
-    string workdir = cmd.get<string>("workdir");
-    bool list = cmd.has("list");
-    int iters = cmd.get<int>("iters");
-
-    if (!filter.empty())
-        TestSystem::instance().setTestFilter(filter);
-
-    if (!workdir.empty())
-    {
-        if (workdir[workdir.size() - 1] != '/' && workdir[workdir.size() - 1] != '\\')
-            workdir += '/';
-
-        TestSystem::instance().setWorkingDir(workdir);
-    }
-
-    if (list)
-        TestSystem::instance().setListMode(true);
-
-    TestSystem::instance().setNumIters(iters);
-
-    cout << "\nNote: the timings for GPU don't include data transfer" << endl;
-
-    TestSystem::instance().run();
-
-    return 0;
-}
+#include <iomanip>
+#include <stdexcept>
+#include <string>
+#include "performance.h"
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+// Runs the registered work.
+// In list mode only the test names are printed. Otherwise every initializer
+// and test whose name contains test_filter_ is executed, framed by the
+// heading and the summary. An exception thrown by a test discards the
+// current subtest and the run continues with the next test.
+void TestSystem::run()
+{
+    if (is_list_mode_)
+    {
+        for (size_t i = 0; i < tests_.size(); ++i)
+            cout << tests_[i]->name() << endl;
+
+        return;
+    }
+
+    // Run test initializers
+    for (size_t i = 0; i < inits_.size(); ++i)
+    {
+        Runnable* init = inits_[i];
+        if (init->name().find(test_filter_, 0) == string::npos)
+            continue;
+
+        init->run();
+    }
+
+    printHeading();
+
+    // Run tests
+    for (size_t i = 0; i < tests_.size(); ++i)
+    {
+        Runnable* test = tests_[i];
+        try
+        {
+            if (test->name().find(test_filter_, 0) == string::npos)
+                continue;
+
+            cout << endl << test->name() << ":\n";
+            test->run();
+            finishCurrentSubtest();
+        }
+        catch (const Exception&)
+        {
+            // Message is printed via callback
+            resetCurrentSubtest();
+        }
+        catch (const runtime_error& e)
+        {
+            printError(e.what());
+            resetCurrentSubtest();
+        }
+    }
+
+    printSummary();
+}
+
+
+// Closes the current subtest: converts the accumulated CPU and GPU ticks to
+// milliseconds, prints them with the speedup, adds the speedup to the
+// running total and resets the subtest state. Does nothing when no
+// measurement was recorded for the subtest.
+void TestSystem::finishCurrentSubtest()
+{
+    if (!cur_subtest_is_empty_)
+    {
+        const double tick_frequency = getTickFrequency();
+        const double cpu_ms = cpu_elapsed_ / tick_frequency * 1000.0;
+        const double gpu_ms = gpu_elapsed_ / tick_frequency * 1000.0;
+
+        // The GPU time is clamped to at least one tick before dividing
+        const double ratio = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_elapsed_);
+        speedup_total_ += ratio;
+
+        printMetrics(cpu_ms, gpu_ms, ratio);
+
+        num_subtests_called_++;
+        resetCurrentSubtest();
+    }
+}
+
+
+// Averages the tick samples. With two or more samples the first one is left
+// out of the average; with one or zero samples the plain sum is returned.
+double TestSystem::meanTime(const vector<int64> &samples)
+{
+    const double total = accumulate(samples.begin(), samples.end(), 0.);
+
+    if (samples.size() <= 1)
+        return total;
+
+    return (total - samples[0]) / (samples.size() - 1);
+}
+
+
+// Prints the left-aligned header row of the result table.
+void TestSystem::printHeading()
+{
+    cout << endl
+         << setiosflags(ios_base::left)
+         << TAB
+         << setw(10) << "CPU, ms"
+         << setw(10) << "GPU, ms"
+         << setw(14) << "SPEEDUP"
+         << "DESCRIPTION\n"
+         << resetiosflags(ios_base::left);
+}
+
+
+// Prints the mean speedup over all finished subtests, in fixed notation with
+// three digits of precision. The divisor is at least 1.
+void TestSystem::printSummary()
+{
+    const double average_speedup = speedup_total_ / std::max(1, num_subtests_called_);
+
+    cout << setiosflags(ios_base::fixed)
+         << "\naverage GPU speedup: x"
+         << setprecision(3) << average_speedup
+         << endl
+         << resetiosflags(ios_base::fixed);
+}
+
+
+// Prints one left-aligned result row: CPU time, GPU time, speedup (prefixed
+// with "x", precision 3) and the description of the current subtest.
+void TestSystem::printMetrics(double cpu_time, double gpu_time, double speedup)
+{
+    // Each cell is formatted on its own so that setw() pads the whole text
+    stringstream cpu_cell;
+    cpu_cell << cpu_time;
+
+    stringstream gpu_cell;
+    gpu_cell << gpu_time;
+
+    stringstream speedup_cell;
+    speedup_cell << "x" << setprecision(3) << speedup;
+
+    cout << TAB << setiosflags(ios_base::left)
+         << setw(10) << cpu_cell.str()
+         << setw(10) << gpu_cell.str()
+         << setw(14) << speedup_cell.str()
+         << cur_subtest_description_.str()
+         << resetiosflags(ios_base::left) << endl;
+}
+
+
+// Prints an error line made of the message and the description of the
+// current subtest.
+void TestSystem::printError(const std::string& msg)
+{
+    const string description = cur_subtest_description_.str();
+
+    cout << TAB << "[error: " << msg << "] " << description << endl;
+}
+
+
+// (Re)allocates mat as rows x cols of the given type and fills it with
+// uniformly distributed values between low and high, drawn from a generator
+// that is always seeded with 0.
+void gen(Mat& mat, int rows, int cols, int type, Scalar low, Scalar high)
+{
+    mat.create(rows, cols, type);
+
+    RNG fixed_seed_rng(0);
+    fixed_seed_rng.fill(mat, RNG::UNIFORM, low, high);
+}
+
+
+// Prepends the test system working directory to relpath (plain string
+// concatenation, no separator is inserted here).
+string abspath(const string& relpath)
+{
+    const string& base_dir = TestSystem::instance().workingDir();
+
+    return base_dir + relpath;
+}
+
+
+// Error handler passed to redirectError() in main(): forwards err_msg to
+// TestSystem::printError() and ignores every other argument.
+// Always returns 0.
+static int CV_CDECL cvErrorCallback(int /*status*/, const char* /*func_name*/,
+                             const char* err_msg, const char* /*file_name*/,
+                             int /*line*/, void* /*userdata*/)
+{
+    TestSystem::instance().printError(err_msg);
+    return 0;
+}
+
+
+// Entry point of the GPU performance sample.
+// Parses the command line, selects the CUDA device, configures the
+// TestSystem singleton and runs the registered tests.
+// Returns -1 when no usable device exists or an argument is invalid, and 0
+// otherwise (including when only the help text is printed).
+int main(int argc, const char* argv[])
+{
+    // Any non-positive count is treated as "no usable device"
+    const int num_devices = getCudaEnabledDeviceCount();
+    if (num_devices <= 0)
+    {
+        cerr << "No GPU found or the library was compiled without GPU support" << endl;
+        return -1;
+    }
+
+    // OpenCV errors raised inside tests are reported through the test system
+    redirectError(cvErrorCallback);
+
+    const char* keys =
+       "{ h  help    |       | print help message }"
+       "{ f  filter  |       | filter for test }"
+       "{ w  workdir |       | set working directory }"
+       "{ l  list    |       | show all tests }"
+       "{ d  device  | 0     | device id }"
+       "{ i  iters   | 10    | iteration count }";
+
+    CommandLineParser cmd(argc, argv, keys);
+
+    if (cmd.has("help") || !cmd.check())
+    {
+        cmd.printMessage();
+        cmd.printErrors();
+        return 0;
+    }
+
+
+    const int device = cmd.get<int>("device");
+    if (device < 0 || device >= num_devices)
+    {
+        cerr << "Invalid device ID" << endl;
+        return -1;
+    }
+    DeviceInfo dev_info(device);
+    if (!dev_info.isCompatible())
+    {
+        cerr << "GPU module isn't built for GPU #" << device << " " << dev_info.name() << ", CC " << dev_info.majorVersion() << '.' << dev_info.minorVersion() << endl;
+        return -1;
+    }
+    setDevice(device);
+    printShortCudaDeviceInfo(device);
+
+    const string filter = cmd.get<string>("filter");
+    string workdir = cmd.get<string>("workdir");
+    const bool list = cmd.has("list");
+    const int iters = cmd.get<int>("iters");
+
+    // With fewer than one iteration the measurement loops (CPU_ON/GPU_ON)
+    // would never execute the code under test, so reject such values.
+    if (iters < 1)
+    {
+        cerr << "Iteration count must be positive" << endl;
+        return -1;
+    }
+
+    if (!filter.empty())
+        TestSystem::instance().setTestFilter(filter);
+
+    if (!workdir.empty())
+    {
+        // abspath() concatenates without a separator, so make sure the
+        // directory ends with one
+        const char last = workdir[workdir.size() - 1];
+        if (last != '/' && last != '\\')
+            workdir += '/';
+
+        TestSystem::instance().setWorkingDir(workdir);
+    }
+
+    if (list)
+        TestSystem::instance().setListMode(true);
+
+    TestSystem::instance().setNumIters(iters);
+
+    cout << "\nNote: the timings for GPU don't include data transfer" << endl;
+
+    TestSystem::instance().run();
+
+    return 0;
+}
--- a/samples/gpu/performance/performance.h
+++ b/samples/gpu/performance/performance.h
@@ -1,189 +1,189 @@
-#ifndef OPENCV_GPU_SAMPLE_PERFORMANCE_H_
-#define OPENCV_GPU_SAMPLE_PERFORMANCE_H_
-
-#include <iostream>
-#include <cstdio>
-#include <vector>
-#include <numeric>
-#include <string>
-#include "opencv2/core/core.hpp"
-#include "opencv2/gpu/gpu.hpp"
-
-#define TAB "    "
-
-class Runnable
-{
-public:
-    explicit Runnable(const std::string& nameStr): name_(nameStr) {}
-    virtual ~Runnable() {}
-
-    const std::string& name() const { return name_; }
-
-    virtual void run() = 0;
-
-private:
-    std::string name_;
-};
-
-
-class TestSystem
-{
-public:
-    static TestSystem& instance()
-    {
-        static TestSystem me;
-        return me;
-    }
-
-    void setWorkingDir(const std::string& val) { working_dir_ = val; }
-    const std::string& workingDir() const { return working_dir_; }
-
-    void setTestFilter(const std::string& val) { test_filter_ = val; }
-    const std::string& testFilter() const { return test_filter_; }
-
-    void setNumIters(int num_iters) { num_iters_ = num_iters; }
-
-    void addInit(Runnable* init) { inits_.push_back(init); }
-    void addTest(Runnable* test) { tests_.push_back(test); }
-    void run();
-
-    // It's public because OpenCV callback uses it
-    void printError(const std::string& msg);
-
-    std::stringstream& startNewSubtest()
-    {
-        finishCurrentSubtest();
-        return cur_subtest_description_;
-    }
-
-    bool stop() const { return cur_iter_idx_ >= num_iters_; }
-
-    void cpuOn() { cpu_started_ = cv::getTickCount(); }
-    void cpuOff()
-    {
-        int64 delta = cv::getTickCount() - cpu_started_;
-        cpu_times_.push_back(delta);
-        ++cur_iter_idx_;
-    }
-    void cpuComplete()
-    {
-        cpu_elapsed_ += meanTime(cpu_times_);
-        cur_subtest_is_empty_ = false;
-        cur_iter_idx_ = 0;
-    }
-
-    void gpuOn() { gpu_started_ = cv::getTickCount(); }
-    void gpuOff()
-    {
-        int64 delta = cv::getTickCount() - gpu_started_;
-        gpu_times_.push_back(delta);
-        ++cur_iter_idx_;
-    }
-    void gpuComplete()
-    {
-        gpu_elapsed_ += meanTime(gpu_times_);
-        cur_subtest_is_empty_ = false;
-        cur_iter_idx_ = 0;
-    }
-
-    bool isListMode() const { return is_list_mode_; }
-    void setListMode(bool value) { is_list_mode_ = value; }
-
-private:
-    TestSystem():
-            cur_subtest_is_empty_(true), cpu_elapsed_(0),
-            gpu_elapsed_(0), speedup_total_(0.0),
-            num_subtests_called_(0), is_list_mode_(false),
-            num_iters_(10), cur_iter_idx_(0)
-    {
-        cpu_times_.reserve(num_iters_);
-        gpu_times_.reserve(num_iters_);
-    }
-
-    void finishCurrentSubtest();
-    void resetCurrentSubtest()
-    {
-        cpu_elapsed_ = 0;
-        gpu_elapsed_ = 0;
-        cur_subtest_description_.str("");
-        cur_subtest_is_empty_ = true;
-        cur_iter_idx_ = 0;
-        cpu_times_.clear();
-        gpu_times_.clear();
-    }
-
-    double meanTime(const std::vector<int64> &samples);
-
-    void printHeading();
-    void printSummary();
-    void printMetrics(double cpu_time, double gpu_time, double speedup);
-
-    std::string working_dir_;
-    std::string test_filter_;
-
-    std::vector<Runnable*> inits_;
-    std::vector<Runnable*> tests_;
-
-    std::stringstream cur_subtest_description_;
-    bool cur_subtest_is_empty_;
-
-    int64 cpu_started_;
-    int64 gpu_started_;
-    double cpu_elapsed_;
-    double gpu_elapsed_;
-
-    double speedup_total_;
-    int num_subtests_called_;
-
-    bool is_list_mode_;
-
-    int num_iters_;
-    int cur_iter_idx_;
-    std::vector<int64> cpu_times_;
-    std::vector<int64> gpu_times_;
-};
-
-
-#define GLOBAL_INIT(name) \
-    struct name##_init: Runnable { \
-        name##_init(): Runnable(#name) { \
-            TestSystem::instance().addInit(this); \
-        } \
-        void run(); \
-    } name##_init_instance; \
-    void name##_init::run()
-
-
-#define TEST(name) \
-    struct name##_test: Runnable { \
-        name##_test(): Runnable(#name) { \
-            TestSystem::instance().addTest(this); \
-        } \
-        void run(); \
-    } name##_test_instance; \
-    void name##_test::run()
-
-#define SUBTEST TestSystem::instance().startNewSubtest()
-
-#define CPU_ON \
-    while (!TestSystem::instance().stop()) { \
-        TestSystem::instance().cpuOn()
-#define CPU_OFF \
-        TestSystem::instance().cpuOff(); \
-    } TestSystem::instance().cpuComplete()
-
-#define GPU_ON \
-    while (!TestSystem::instance().stop()) { \
-        TestSystem::instance().gpuOn()
-#define GPU_OFF \
-        TestSystem::instance().gpuOff(); \
-    } TestSystem::instance().gpuComplete()
-
-// Generates a matrix
-void gen(cv::Mat& mat, int rows, int cols, int type, cv::Scalar low,
-         cv::Scalar high);
-
-// Returns abs path taking into account test system working dir
-std::string abspath(const std::string& relpath);
-
-#endif // OPENCV_GPU_SAMPLE_PERFORMANCE_H_
+#ifndef OPENCV_GPU_SAMPLE_PERFORMANCE_H_
+#define OPENCV_GPU_SAMPLE_PERFORMANCE_H_
+
+#include <iostream>
+#include <cstdio>
+#include <vector>
+#include <numeric>
+#include <string>
+#include "opencv2/core/core.hpp"
+#include "opencv2/gpu/gpu.hpp"
+
+#define TAB "    "
+
+// Base class of everything the test system can execute: a named object with
+// a run() method implemented by the derived class.
+class Runnable
+{
+public:
+    // nameStr is copied and returned unchanged by name()
+    explicit Runnable(const std::string& nameStr): name_(nameStr) {}
+    virtual ~Runnable() {}
+
+    const std::string& name() const { return name_; }
+
+    // The work to perform; supplied by the derived class
+    virtual void run() = 0;
+
+private:
+    std::string name_;
+};
+
+
+// Singleton that owns the registered initializers and tests, measures CPU and
+// GPU timings per subtest and prints the result table.
+class TestSystem
+{
+public:
+    // Returns the single instance, created on first use
+    static TestSystem& instance()
+    {
+        static TestSystem me;
+        return me;
+    }
+
+    void setWorkingDir(const std::string& val) { working_dir_ = val; }
+    const std::string& workingDir() const { return working_dir_; }
+
+    void setTestFilter(const std::string& val) { test_filter_ = val; }
+    const std::string& testFilter() const { return test_filter_; }
+
+    // Number of timed iterations per CPU_ON/GPU_ON measurement loop
+    void setNumIters(int num_iters) { num_iters_ = num_iters; }
+
+    // Registration; the pointers are stored as given and not owned
+    void addInit(Runnable* init) { inits_.push_back(init); }
+    void addTest(Runnable* test) { tests_.push_back(test); }
+    void run();
+
+    // It's public because OpenCV callback uses it
+    void printError(const std::string& msg);
+
+    // Finishes the previous subtest and returns the stream that receives the
+    // description of the new one
+    std::stringstream& startNewSubtest()
+    {
+        finishCurrentSubtest();
+        return cur_subtest_description_;
+    }
+
+    // True once the current measurement loop has run num_iters_ iterations
+    bool stop() const { return cur_iter_idx_ >= num_iters_; }
+
+    // cpuOn()/cpuOff() bracket one timed CPU iteration; cpuComplete() adds
+    // the mean of the collected samples to the subtest total and rewinds the
+    // iteration counter
+    void cpuOn() { cpu_started_ = cv::getTickCount(); }
+    void cpuOff()
+    {
+        int64 delta = cv::getTickCount() - cpu_started_;
+        cpu_times_.push_back(delta);
+        ++cur_iter_idx_;
+    }
+    void cpuComplete()
+    {
+        cpu_elapsed_ += meanTime(cpu_times_);
+        cur_subtest_is_empty_ = false;
+        cur_iter_idx_ = 0;
+    }
+
+    // GPU counterparts of cpuOn()/cpuOff()/cpuComplete()
+    void gpuOn() { gpu_started_ = cv::getTickCount(); }
+    void gpuOff()
+    {
+        int64 delta = cv::getTickCount() - gpu_started_;
+        gpu_times_.push_back(delta);
+        ++cur_iter_idx_;
+    }
+    void gpuComplete()
+    {
+        gpu_elapsed_ += meanTime(gpu_times_);
+        cur_subtest_is_empty_ = false;
+        cur_iter_idx_ = 0;
+    }
+
+    bool isListMode() const { return is_list_mode_; }
+    void setListMode(bool value) { is_list_mode_ = value; }
+
+private:
+    // Every scalar member is initialized here, including the two start
+    // timestamps that were previously left indeterminate until the first
+    // cpuOn()/gpuOn() call. The order follows the member declarations.
+    TestSystem():
+            cur_subtest_is_empty_(true),
+            cpu_started_(0), gpu_started_(0),
+            cpu_elapsed_(0), gpu_elapsed_(0),
+            speedup_total_(0.0), num_subtests_called_(0),
+            is_list_mode_(false),
+            num_iters_(10), cur_iter_idx_(0)
+    {
+        cpu_times_.reserve(num_iters_);
+        gpu_times_.reserve(num_iters_);
+    }
+
+    void finishCurrentSubtest();
+
+    // Clears every per-subtest accumulator, sample and the description
+    void resetCurrentSubtest()
+    {
+        cpu_elapsed_ = 0;
+        gpu_elapsed_ = 0;
+        cur_subtest_description_.str("");
+        cur_subtest_is_empty_ = true;
+        cur_iter_idx_ = 0;
+        cpu_times_.clear();
+        gpu_times_.clear();
+    }
+
+    double meanTime(const std::vector<int64> &samples);
+
+    void printHeading();
+    void printSummary();
+    void printMetrics(double cpu_time, double gpu_time, double speedup);
+
+    std::string working_dir_;
+    std::string test_filter_;
+
+    std::vector<Runnable*> inits_;
+    std::vector<Runnable*> tests_;
+
+    std::stringstream cur_subtest_description_;
+    bool cur_subtest_is_empty_;
+
+    // Tick counts taken by cpuOn()/gpuOn()
+    int64 cpu_started_;
+    int64 gpu_started_;
+    // Mean ticks accumulated for the current subtest
+    double cpu_elapsed_;
+    double gpu_elapsed_;
+
+    double speedup_total_;
+    int num_subtests_called_;
+
+    bool is_list_mode_;
+
+    int num_iters_;
+    int cur_iter_idx_;
+    std::vector<int64> cpu_times_;
+    std::vector<int64> gpu_times_;
+};
+
+
+// Defines a Runnable named after `name` whose single global instance
+// registers itself with TestSystem::addInit() in its constructor.
+// The block following the macro becomes the body of run().
+#define GLOBAL_INIT(name) \
+    struct name##_init: Runnable { \
+        name##_init(): Runnable(#name) { \
+            TestSystem::instance().addInit(this); \
+        } \
+        void run(); \
+    } name##_init_instance; \
+    void name##_init::run()
+
+
+// Defines a Runnable named after `name` whose single global instance
+// registers itself with TestSystem::addTest() in its constructor.
+// The block following the macro becomes the body of run().
+#define TEST(name) \
+    struct name##_test: Runnable { \
+        name##_test(): Runnable(#name) { \
+            TestSystem::instance().addTest(this); \
+        } \
+        void run(); \
+    } name##_test_instance; \
+    void name##_test::run()
+
+// Finishes the previous subtest and yields the description stream of the new
+// one, so a description can be streamed into it with <<
+#define SUBTEST TestSystem::instance().startNewSubtest()
+
+// CPU_ON opens a loop that repeats until TestSystem::stop() is true and
+// starts the CPU timer; CPU_OFF stops the timer, closes the loop and records
+// the measurement. The two macros must be used as a pair.
+#define CPU_ON \
+    while (!TestSystem::instance().stop()) { \
+        TestSystem::instance().cpuOn()
+#define CPU_OFF \
+        TestSystem::instance().cpuOff(); \
+    } TestSystem::instance().cpuComplete()
+
+// GPU counterparts of CPU_ON / CPU_OFF; they must also be used as a pair
+#define GPU_ON \
+    while (!TestSystem::instance().stop()) { \
+        TestSystem::instance().gpuOn()
+#define GPU_OFF \
+        TestSystem::instance().gpuOff(); \
+    } TestSystem::instance().gpuComplete()
+
+// Generates a matrix
+void gen(cv::Mat& mat, int rows, int cols, int type, cv::Scalar low,
+         cv::Scalar high);
+
+// Returns abs path taking into account test system working dir
+std::string abspath(const std::string& relpath);
+
+#endif // OPENCV_GPU_SAMPLE_PERFORMANCE_H_
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
--- a/samples/gpu/pyrlk_optical_flow.cpp
+++ b/samples/gpu/pyrlk_optical_flow.cpp
@@ -1,291 +1,291 @@
-#include <iostream>
-#include <vector>
-
-#include "cvconfig.h"
-#include "opencv2/core/core.hpp"
-#include "opencv2/core/opengl_interop.hpp"
-#include "opencv2/imgproc/imgproc.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/video/video.hpp"
-#include "opencv2/gpu/gpu.hpp"
-
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-
-static void download(const GpuMat& d_mat, vector<Point2f>& vec)
-{
-    vec.resize(d_mat.cols);
-    Mat mat(1, d_mat.cols, CV_32FC2, (void*)&vec[0]);
-    d_mat.download(mat);
-}
-
-static void download(const GpuMat& d_mat, vector<uchar>& vec)
-{
-    vec.resize(d_mat.cols);
-    Mat mat(1, d_mat.cols, CV_8UC1, (void*)&vec[0]);
-    d_mat.download(mat);
-}
-
-static void drawArrows(Mat& frame, const vector<Point2f>& prevPts, const vector<Point2f>& nextPts, const vector<uchar>& status, Scalar line_color = Scalar(0, 0, 255))
-{
-    for (size_t i = 0; i < prevPts.size(); ++i)
-    {
-        if (status[i])
-        {
-            int line_thickness = 1;
-
-            Point p = prevPts[i];
-            Point q = nextPts[i];
-
-            double angle = atan2((double) p.y - q.y, (double) p.x - q.x);
-
-            double hypotenuse = sqrt( (double)(p.y - q.y)*(p.y - q.y) + (double)(p.x - q.x)*(p.x - q.x) );
-
-            if (hypotenuse < 1.0)
-                continue;
-
-            // Here we lengthen the arrow by a factor of three.
-            q.x = (int) (p.x - 3 * hypotenuse * cos(angle));
-            q.y = (int) (p.y - 3 * hypotenuse * sin(angle));
-
-            // Now we draw the main line of the arrow.
-            line(frame, p, q, line_color, line_thickness);
-
-            // Now draw the tips of the arrow. I do some scaling so that the
-            // tips look proportional to the main line of the arrow.
-
-            p.x = (int) (q.x + 9 * cos(angle + CV_PI / 4));
-            p.y = (int) (q.y + 9 * sin(angle + CV_PI / 4));
-            line(frame, p, q, line_color, line_thickness);
-
-            p.x = (int) (q.x + 9 * cos(angle - CV_PI / 4));
-            p.y = (int) (q.y + 9 * sin(angle - CV_PI / 4));
-            line(frame, p, q, line_color, line_thickness);
-        }
-    }
-}
-
-#ifdef HAVE_OPENGL
-
-struct DrawData
-{
-    GlTexture tex;
-    GlArrays arr;
-};
-
-static void drawCallback(void* userdata)
-{
-    DrawData* data = static_cast<DrawData*>(userdata);
-
-    if (data->tex.empty() || data->arr.empty())
-        return;
-
-    static GlCamera camera;
-    static bool init_camera = true;
-
-    if (init_camera)
-    {
-        camera.setOrthoProjection(0.0, 1.0, 1.0, 0.0, 0.0, 1.0);
-        camera.lookAt(Point3d(0.0, 0.0, 1.0), Point3d(0.0, 0.0, 0.0), Point3d(0.0, 1.0, 0.0));
-        init_camera = false;
-    }
-
-    camera.setupProjectionMatrix();
-    camera.setupModelViewMatrix();
-
-    render(data->tex);
-    render(data->arr, RenderMode::TRIANGLES);
-}
-
-#endif
-
-template <typename T> inline T clamp (T x, T a, T b)
-{
-    return ((x) > (a) ? ((x) < (b) ? (x) : (b)) : (a));
-}
-
-template <typename T> inline T mapValue(T x, T a, T b, T c, T d)
-{
-    x = clamp(x, a, b);
-    return c + (d - c) * (x - a) / (b - a);
-}
-
-static void getFlowField(const Mat& u, const Mat& v, Mat& flowField)
-{
-    float maxDisplacement = 1.0f;
-
-    for (int i = 0; i < u.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-        for (int j = 0; j < u.cols; ++j)
-        {
-            float d = max(fabsf(ptr_u[j]), fabsf(ptr_v[j]));
-
-            if (d > maxDisplacement)
-                maxDisplacement = d;
-        }
-    }
-
-    flowField.create(u.size(), CV_8UC4);
-
-    for (int i = 0; i < flowField.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-
-        Vec4b* row = flowField.ptr<Vec4b>(i);
-
-        for (int j = 0; j < flowField.cols; ++j)
-        {
-            row[j][0] = 0;
-            row[j][1] = static_cast<unsigned char> (mapValue (-ptr_v[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][2] = static_cast<unsigned char> (mapValue ( ptr_u[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][3] = 255;
-        }
-    }
-}
-
-int main(int argc, const char* argv[])
-{
-    const char* keys =
-        "{ h             help   |       | print help message }"
-        "{ l             left   |       | specify left image }"
-        "{ r             right  |       | specify right image }"
-        "{ gray                 |       | use grayscale sources [PyrLK Sparse] }"
-        "{ win_size             | 21    | specify windows size [PyrLK] }"
-        "{ max_level            | 3     | specify max level [PyrLK] }"
-        "{ iters                | 30    | specify iterations count [PyrLK] }"
-        "{ points               | 4000  | specify points count [GoodFeatureToTrack] }"
-        "{ min_dist             | 0     | specify minimal distance between points [GoodFeatureToTrack] }";
-
-    CommandLineParser cmd(argc, argv, keys);
-
-    if (cmd.has("help") || !cmd.check())
-    {
-        cmd.printMessage();
-        cmd.printErrors();
-        return 0;
-    }
-
-    string fname0 = cmd.get<string>("left");
-    string fname1 = cmd.get<string>("right");
-
-    if (fname0.empty() || fname1.empty())
-    {
-        cerr << "Missing input file names" << endl;
-        return -1;
-    }
-
-    bool useGray = cmd.has("gray");
-    int winSize = cmd.get<int>("win_size");
-    int maxLevel = cmd.get<int>("max_level");
-    int iters = cmd.get<int>("iters");
-    int points = cmd.get<int>("points");
-    double minDist = cmd.get<double>("min_dist");
-
-    Mat frame0 = imread(fname0);
-    Mat frame1 = imread(fname1);
-
-    if (frame0.empty() || frame1.empty())
-    {
-        cout << "Can't load input images" << endl;
-        return -1;
-    }
-
-    namedWindow("PyrLK [Sparse]", WINDOW_NORMAL);
-    namedWindow("PyrLK [Dense] Flow Field", WINDOW_NORMAL);
-
-    #ifdef HAVE_OPENGL
-        namedWindow("PyrLK [Dense]", WINDOW_OPENGL);
-
-        setGlDevice();
-    #endif
-
-    cout << "Image size : " << frame0.cols << " x " << frame0.rows << endl;
-    cout << "Points count : " << points << endl;
-
-    cout << endl;
-
-    Mat frame0Gray;
-    cvtColor(frame0, frame0Gray, COLOR_BGR2GRAY);
-    Mat frame1Gray;
-    cvtColor(frame1, frame1Gray, COLOR_BGR2GRAY);
-
-    // goodFeaturesToTrack
-
-    GoodFeaturesToTrackDetector_GPU detector(points, 0.01, minDist);
-
-    GpuMat d_frame0Gray(frame0Gray);
-    GpuMat d_prevPts;
-
-    detector(d_frame0Gray, d_prevPts);
-
-    // Sparse
-
-    PyrLKOpticalFlow d_pyrLK;
-
-    d_pyrLK.winSize.width = winSize;
-    d_pyrLK.winSize.height = winSize;
-    d_pyrLK.maxLevel = maxLevel;
-    d_pyrLK.iters = iters;
-
-    GpuMat d_frame0(frame0);
-    GpuMat d_frame1(frame1);
-    GpuMat d_frame1Gray(frame1Gray);
-    GpuMat d_nextPts;
-    GpuMat d_status;
-
-    d_pyrLK.sparse(useGray ? d_frame0Gray : d_frame0, useGray ? d_frame1Gray : d_frame1, d_prevPts, d_nextPts, d_status);
-
-    // Draw arrows
-
-    vector<Point2f> prevPts(d_prevPts.cols);
-    download(d_prevPts, prevPts);
-
-    vector<Point2f> nextPts(d_nextPts.cols);
-    download(d_nextPts, nextPts);
-
-    vector<uchar> status(d_status.cols);
-    download(d_status, status);
-
-    drawArrows(frame0, prevPts, nextPts, status, Scalar(255, 0, 0));
-
-    imshow("PyrLK [Sparse]", frame0);
-
-    // Dense
-
-    GpuMat d_u;
-    GpuMat d_v;
-
-    d_pyrLK.dense(d_frame0Gray, d_frame1Gray, d_u, d_v);
-
-    // Draw flow field
-
-    Mat flowField;
-    getFlowField(Mat(d_u), Mat(d_v), flowField);
-
-    imshow("PyrLK [Dense] Flow Field", flowField);
-
-    #ifdef HAVE_OPENGL
-        setOpenGlContext("PyrLK [Dense]");
-
-        GpuMat d_vertex, d_colors;
-        createOpticalFlowNeedleMap(d_u, d_v, d_vertex, d_colors);
-
-        DrawData drawData;
-
-        drawData.tex.copyFrom(d_frame0Gray);
-        drawData.arr.setVertexArray(d_vertex);
-        drawData.arr.setColorArray(d_colors, false);
-
-        setOpenGlDrawCallback("PyrLK [Dense]", drawCallback, &drawData);
-    #endif
-
-    waitKey();
-
-    return 0;
-}
+#include <iostream>
+#include <vector>
+
+#include "cvconfig.h"
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/opengl_interop.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/video/video.hpp"
+#include "opencv2/gpu/gpu.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+static void download(const GpuMat& d_mat, vector<Point2f>& vec)
+{
+    vec.resize(d_mat.cols);
+    Mat mat(1, d_mat.cols, CV_32FC2, (void*)&vec[0]);
+    d_mat.download(mat);
+}
+
+static void download(const GpuMat& d_mat, vector<uchar>& vec)
+{
+    vec.resize(d_mat.cols);
+    Mat mat(1, d_mat.cols, CV_8UC1, (void*)&vec[0]);
+    d_mat.download(mat);
+}
+
+static void drawArrows(Mat& frame, const vector<Point2f>& prevPts, const vector<Point2f>& nextPts, const vector<uchar>& status, Scalar line_color = Scalar(0, 0, 255))
+{
+    for (size_t i = 0; i < prevPts.size(); ++i)
+    {
+        if (status[i])
+        {
+            int line_thickness = 1;
+
+            Point p = prevPts[i];
+            Point q = nextPts[i];
+
+            double angle = atan2((double) p.y - q.y, (double) p.x - q.x);
+
+            double hypotenuse = sqrt( (double)(p.y - q.y)*(p.y - q.y) + (double)(p.x - q.x)*(p.x - q.x) );
+
+            if (hypotenuse < 1.0)
+                continue;
+
+            // Here we lengthen the arrow by a factor of three.
+            q.x = (int) (p.x - 3 * hypotenuse * cos(angle));
+            q.y = (int) (p.y - 3 * hypotenuse * sin(angle));
+
+            // Now we draw the main line of the arrow.
+            line(frame, p, q, line_color, line_thickness);
+
+            // Now draw the tips of the arrow. I do some scaling so that the
+            // tips look proportional to the main line of the arrow.
+
+            p.x = (int) (q.x + 9 * cos(angle + CV_PI / 4));
+            p.y = (int) (q.y + 9 * sin(angle + CV_PI / 4));
+            line(frame, p, q, line_color, line_thickness);
+
+            p.x = (int) (q.x + 9 * cos(angle - CV_PI / 4));
+            p.y = (int) (q.y + 9 * sin(angle - CV_PI / 4));
+            line(frame, p, q, line_color, line_thickness);
+        }
+    }
+}
+
+#ifdef HAVE_OPENGL
+
+struct DrawData
+{
+    GlTexture tex;
+    GlArrays arr;
+};
+
+static void drawCallback(void* userdata)
+{
+    DrawData* data = static_cast<DrawData*>(userdata);
+
+    if (data->tex.empty() || data->arr.empty())
+        return;
+
+    static GlCamera camera;
+    static bool init_camera = true;
+
+    if (init_camera)
+    {
+        camera.setOrthoProjection(0.0, 1.0, 1.0, 0.0, 0.0, 1.0);
+        camera.lookAt(Point3d(0.0, 0.0, 1.0), Point3d(0.0, 0.0, 0.0), Point3d(0.0, 1.0, 0.0));
+        init_camera = false;
+    }
+
+    camera.setupProjectionMatrix();
+    camera.setupModelViewMatrix();
+
+    render(data->tex);
+    render(data->arr, RenderMode::TRIANGLES);
+}
+
+#endif
+
+template <typename T> inline T clamp (T x, T a, T b)
+{
+    return ((x) > (a) ? ((x) < (b) ? (x) : (b)) : (a));
+}
+
+template <typename T> inline T mapValue(T x, T a, T b, T c, T d)
+{
+    x = clamp(x, a, b);
+    return c + (d - c) * (x - a) / (b - a);
+}
+
+static void getFlowField(const Mat& u, const Mat& v, Mat& flowField)
+{
+    float maxDisplacement = 1.0f;
+
+    for (int i = 0; i < u.rows; ++i)
+    {
+        const float* ptr_u = u.ptr<float>(i);
+        const float* ptr_v = v.ptr<float>(i);
+
+        for (int j = 0; j < u.cols; ++j)
+        {
+            float d = max(fabsf(ptr_u[j]), fabsf(ptr_v[j]));
+
+            if (d > maxDisplacement)
+                maxDisplacement = d;
+        }
+    }
+
+    flowField.create(u.size(), CV_8UC4);
+
+    for (int i = 0; i < flowField.rows; ++i)
+    {
+        const float* ptr_u = u.ptr<float>(i);
+        const float* ptr_v = v.ptr<float>(i);
+
+
+        Vec4b* row = flowField.ptr<Vec4b>(i);
+
+        for (int j = 0; j < flowField.cols; ++j)
+        {
+            row[j][0] = 0;
+            row[j][1] = static_cast<unsigned char> (mapValue (-ptr_v[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
+            row[j][2] = static_cast<unsigned char> (mapValue ( ptr_u[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
+            row[j][3] = 255;
+        }
+    }
+}
+
+int main(int argc, const char* argv[])
+{
+    const char* keys =
+        "{ h             help   |       | print help message }"
+        "{ l             left   |       | specify left image }"
+        "{ r             right  |       | specify right image }"
+        "{ gray                 |       | use grayscale sources [PyrLK Sparse] }"
+        "{ win_size             | 21    | specify windows size [PyrLK] }"
+        "{ max_level            | 3     | specify max level [PyrLK] }"
+        "{ iters                | 30    | specify iterations count [PyrLK] }"
+        "{ points               | 4000  | specify points count [GoodFeatureToTrack] }"
+        "{ min_dist             | 0     | specify minimal distance between points [GoodFeatureToTrack] }";
+
+    CommandLineParser cmd(argc, argv, keys);
+
+    if (cmd.has("help") || !cmd.check())
+    {
+        cmd.printMessage();
+        cmd.printErrors();
+        return 0;
+    }
+
+    string fname0 = cmd.get<string>("left");
+    string fname1 = cmd.get<string>("right");
+
+    if (fname0.empty() || fname1.empty())
+    {
+        cerr << "Missing input file names" << endl;
+        return -1;
+    }
+
+    bool useGray = cmd.has("gray");
+    int winSize = cmd.get<int>("win_size");
+    int maxLevel = cmd.get<int>("max_level");
+    int iters = cmd.get<int>("iters");
+    int points = cmd.get<int>("points");
+    double minDist = cmd.get<double>("min_dist");
+
+    Mat frame0 = imread(fname0);
+    Mat frame1 = imread(fname1);
+
+    if (frame0.empty() || frame1.empty())
+    {
+        cout << "Can't load input images" << endl;
+        return -1;
+    }
+
+    namedWindow("PyrLK [Sparse]", WINDOW_NORMAL);
+    namedWindow("PyrLK [Dense] Flow Field", WINDOW_NORMAL);
+
+    #ifdef HAVE_OPENGL
+        namedWindow("PyrLK [Dense]", WINDOW_OPENGL);
+
+        setGlDevice();
+    #endif
+
+    cout << "Image size : " << frame0.cols << " x " << frame0.rows << endl;
+    cout << "Points count : " << points << endl;
+
+    cout << endl;
+
+    Mat frame0Gray;
+    cvtColor(frame0, frame0Gray, COLOR_BGR2GRAY);
+    Mat frame1Gray;
+    cvtColor(frame1, frame1Gray, COLOR_BGR2GRAY);
+
+    // goodFeaturesToTrack
+
+    GoodFeaturesToTrackDetector_GPU detector(points, 0.01, minDist);
+
+    GpuMat d_frame0Gray(frame0Gray);
+    GpuMat d_prevPts;
+
+    detector(d_frame0Gray, d_prevPts);
+
+    // Sparse
+
+    PyrLKOpticalFlow d_pyrLK;
+
+    d_pyrLK.winSize.width = winSize;
+    d_pyrLK.winSize.height = winSize;
+    d_pyrLK.maxLevel = maxLevel;
+    d_pyrLK.iters = iters;
+
+    GpuMat d_frame0(frame0);
+    GpuMat d_frame1(frame1);
+    GpuMat d_frame1Gray(frame1Gray);
+    GpuMat d_nextPts;
+    GpuMat d_status;
+
+    d_pyrLK.sparse(useGray ? d_frame0Gray : d_frame0, useGray ? d_frame1Gray : d_frame1, d_prevPts, d_nextPts, d_status);
+
+    // Draw arrows
+
+    vector<Point2f> prevPts(d_prevPts.cols);
+    download(d_prevPts, prevPts);
+
+    vector<Point2f> nextPts(d_nextPts.cols);
+    download(d_nextPts, nextPts);
+
+    vector<uchar> status(d_status.cols);
+    download(d_status, status);
+
+    drawArrows(frame0, prevPts, nextPts, status, Scalar(255, 0, 0));
+
+    imshow("PyrLK [Sparse]", frame0);
+
+    // Dense
+
+    GpuMat d_u;
+    GpuMat d_v;
+
+    d_pyrLK.dense(d_frame0Gray, d_frame1Gray, d_u, d_v);
+
+    // Draw flow field
+
+    Mat flowField;
+    getFlowField(Mat(d_u), Mat(d_v), flowField);
+
+    imshow("PyrLK [Dense] Flow Field", flowField);
+
+    #ifdef HAVE_OPENGL
+        setOpenGlContext("PyrLK [Dense]");
+
+        GpuMat d_vertex, d_colors;
+        createOpticalFlowNeedleMap(d_u, d_v, d_vertex, d_colors);
+
+        DrawData drawData;
+
+        drawData.tex.copyFrom(d_frame0Gray);
+        drawData.arr.setVertexArray(d_vertex);
+        drawData.arr.setColorArray(d_colors, false);
+
+        setOpenGlDrawCallback("PyrLK [Dense]", drawCallback, &drawData);
+    #endif
+
+    waitKey();
+
+    return 0;
+}
--- a/samples/gpu/stereo_match.cpp
+++ b/samples/gpu/stereo_match.cpp
@@ -1,382 +1,382 @@
-#include <iostream>
-#include <string>
-#include <sstream>
-#include <iomanip>
-#include <stdexcept>
-#include "opencv2/gpu/gpu.hpp"
-#include "opencv2/highgui/highgui.hpp"
-
-using namespace cv;
-using namespace std;
-
-bool help_showed = false;
-
-struct Params
-{
-    Params();
-    static Params read(int argc, char** argv);
-
-    string left;
-    string right;
-
-    string method_str() const
-    {
-        switch (method)
-        {
-        case BM: return "BM";
-        case BP: return "BP";
-        case CSBP: return "CSBP";
-        }
-        return "";
-    }
-    enum {BM, BP, CSBP} method;
-    int ndisp; // Max disparity + 1
-};
-
-
-struct App
-{
-    App(const Params& p);
-    void run();
-    void handleKey(char key);
-    void printParams() const;
-
-    void workBegin() { work_begin = getTickCount(); }
-    void workEnd()
-    {
-        int64 d = getTickCount() - work_begin;
-        double f = getTickFrequency();
-        work_fps = f / d;
-    }
-
-    string text() const
-    {
-        stringstream ss;
-        ss << "(" << p.method_str() << ") FPS: " << setiosflags(ios::left)
-            << setprecision(4) << work_fps;
-        return ss.str();
-    }
-private:
-    Params p;
-    bool running;
-
-    Mat left_src, right_src;
-    Mat left, right;
-    gpu::GpuMat d_left, d_right;
-
-    gpu::StereoBM_GPU bm;
-    gpu::StereoBeliefPropagation bp;
-    gpu::StereoConstantSpaceBP csbp;
-
-    int64 work_begin;
-    double work_fps;
-};
-
-static void printHelp()
-{
-    cout << "Usage: stereo_match_gpu\n"
-        << "\t--left <left_view> --right <right_view> # must be rectified\n"
-        << "\t--method <stereo_match_method> # BM | BP | CSBP\n"
-        << "\t--ndisp <number> # number of disparity levels\n";
-    help_showed = true;
-}
-
-int main(int argc, char** argv)
-{
-    try
-    {
-        if (argc < 2)
-        {
-            printHelp();
-            return 1;
-        }
-        Params args = Params::read(argc, argv);
-        if (help_showed)
-            return -1;
-        App app(args);
-        app.run();
-    }
-    catch (const exception& e)
-    {
-        cout << "error: " << e.what() << endl;
-    }
-    return 0;
-}
-
-
-Params::Params()
-{
-    method = BM;
-    ndisp = 64;
-}
-
-
-Params Params::read(int argc, char** argv)
-{
-    Params p;
-
-    for (int i = 1; i < argc; i++)
-    {
-        if (string(argv[i]) == "--left") p.left = argv[++i];
-        else if (string(argv[i]) == "--right") p.right = argv[++i];
-        else if (string(argv[i]) == "--method")
-        {
-            if (string(argv[i + 1]) == "BM") p.method = BM;
-            else if (string(argv[i + 1]) == "BP") p.method = BP;
-            else if (string(argv[i + 1]) == "CSBP") p.method = CSBP;
-            else throw runtime_error("unknown stereo match method: " + string(argv[i + 1]));
-            i++;
-        }
-        else if (string(argv[i]) == "--ndisp") p.ndisp = atoi(argv[++i]);
-        else if (string(argv[i]) == "--help") printHelp();
-        else throw runtime_error("unknown key: " + string(argv[i]));
-    }
-
-    return p;
-}
-
-
-App::App(const Params& params)
-    : p(params), running(false)
-{
-    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
-
-    cout << "stereo_match_gpu sample\n";
-    cout << "\nControls:\n"
-        << "\tesc - exit\n"
-        << "\tp - print current parameters\n"
-        << "\tg - convert source images into gray\n"
-        << "\tm - change stereo match method\n"
-        << "\ts - change Sobel prefiltering flag (for BM only)\n"
-        << "\t1/q - increase/decrease maximum disparity\n"
-        << "\t2/w - increase/decrease window size (for BM only)\n"
-        << "\t3/e - increase/decrease iteration count (for BP and CSBP only)\n"
-        << "\t4/r - increase/decrease level count (for BP and CSBP only)\n";
-}
-
-
-void App::run()
-{
-    // Load images
-    left_src = imread(p.left);
-    right_src = imread(p.right);
-    if (left_src.empty()) throw runtime_error("can't open file \"" + p.left + "\"");
-    if (right_src.empty()) throw runtime_error("can't open file \"" + p.right + "\"");
-    cvtColor(left_src, left, CV_BGR2GRAY);
-    cvtColor(right_src, right, CV_BGR2GRAY);
-    d_left.upload(left);
-    d_right.upload(right);
-
-    imshow("left", left);
-    imshow("right", right);
-
-    // Set common parameters
-    bm.ndisp = p.ndisp;
-    bp.ndisp = p.ndisp;
-    csbp.ndisp = p.ndisp;
-
-    // Prepare disparity map of specified type
-    Mat disp(left.size(), CV_8U);
-    gpu::GpuMat d_disp(left.size(), CV_8U);
-
-    cout << endl;
-    printParams();
-
-    running = true;
-    while (running)
-    {
-        workBegin();
-        switch (p.method)
-        {
-        case Params::BM:
-            if (d_left.channels() > 1 || d_right.channels() > 1)
-            {
-                cout << "BM doesn't support color images\n";
-                cvtColor(left_src, left, CV_BGR2GRAY);
-                cvtColor(right_src, right, CV_BGR2GRAY);
-                cout << "image_channels: " << left.channels() << endl;
-                d_left.upload(left);
-                d_right.upload(right);
-                imshow("left", left);
-                imshow("right", right);
-            }
-            bm(d_left, d_right, d_disp);
-            break;
-        case Params::BP: bp(d_left, d_right, d_disp); break;
-        case Params::CSBP: csbp(d_left, d_right, d_disp); break;
-        }
-        workEnd();
-
-        // Show results
-        d_disp.download(disp);
-        putText(disp, text(), Point(5, 25), FONT_HERSHEY_SIMPLEX, 1.0, Scalar::all(255));
-        imshow("disparity", disp);
-
-        handleKey((char)waitKey(3));
-    }
-}
-
-
-void App::printParams() const
-{
-    cout << "--- Parameters ---\n";
-    cout << "image_size: (" << left.cols << ", " << left.rows << ")\n";
-    cout << "image_channels: " << left.channels() << endl;
-    cout << "method: " << p.method_str() << endl
-        << "ndisp: " << p.ndisp << endl;
-    switch (p.method)
-    {
-    case Params::BM:
-        cout << "win_size: " << bm.winSize << endl;
-        cout << "prefilter_sobel: " << bm.preset << endl;
-        break;
-    case Params::BP:
-        cout << "iter_count: " << bp.iters << endl;
-        cout << "level_count: " << bp.levels << endl;
-        break;
-    case Params::CSBP:
-        cout << "iter_count: " << csbp.iters << endl;
-        cout << "level_count: " << csbp.levels << endl;
-        break;
-    }
-    cout << endl;
-}
-
-
-void App::handleKey(char key)
-{
-    switch (key)
-    {
-    case 27:
-        running = false;
-        break;
-    case 'p': case 'P':
-        printParams();
-        break;
-    case 'g': case 'G':
-        if (left.channels() == 1 && p.method != Params::BM)
-        {
-            left = left_src;
-            right = right_src;
-        }
-        else
-        {
-            cvtColor(left_src, left, CV_BGR2GRAY);
-            cvtColor(right_src, right, CV_BGR2GRAY);
-        }
-        d_left.upload(left);
-        d_right.upload(right);
-        cout << "image_channels: " << left.channels() << endl;
-        imshow("left", left);
-        imshow("right", right);
-        break;
-    case 'm': case 'M':
-        switch (p.method)
-        {
-        case Params::BM:
-            p.method = Params::BP;
-            break;
-        case Params::BP:
-            p.method = Params::CSBP;
-            break;
-        case Params::CSBP:
-            p.method = Params::BM;
-            break;
-        }
-        cout << "method: " << p.method_str() << endl;
-        break;
-    case 's': case 'S':
-        if (p.method == Params::BM)
-        {
-            switch (bm.preset)
-            {
-            case gpu::StereoBM_GPU::BASIC_PRESET:
-                bm.preset = gpu::StereoBM_GPU::PREFILTER_XSOBEL;
-                break;
-            case gpu::StereoBM_GPU::PREFILTER_XSOBEL:
-                bm.preset = gpu::StereoBM_GPU::BASIC_PRESET;
-                break;
-            }
-            cout << "prefilter_sobel: " << bm.preset << endl;
-        }
-        break;
-    case '1':
-        p.ndisp = p.ndisp == 1 ? 8 : p.ndisp + 8;
-        cout << "ndisp: " << p.ndisp << endl;
-        bm.ndisp = p.ndisp;
-        bp.ndisp = p.ndisp;
-        csbp.ndisp = p.ndisp;
-        break;
-    case 'q': case 'Q':
-        p.ndisp = max(p.ndisp - 8, 1);
-        cout << "ndisp: " << p.ndisp << endl;
-        bm.ndisp = p.ndisp;
-        bp.ndisp = p.ndisp;
-        csbp.ndisp = p.ndisp;
-        break;
-    case '2':
-        if (p.method == Params::BM)
-        {
-            bm.winSize = min(bm.winSize + 1, 51);
-            cout << "win_size: " << bm.winSize << endl;
-        }
-        break;
-    case 'w': case 'W':
-        if (p.method == Params::BM)
-        {
-            bm.winSize = max(bm.winSize - 1, 2);
-            cout << "win_size: " << bm.winSize << endl;
-        }
-        break;
-    case '3':
-        if (p.method == Params::BP)
-        {
-            bp.iters += 1;
-            cout << "iter_count: " << bp.iters << endl;
-        }
-        else if (p.method == Params::CSBP)
-        {
-            csbp.iters += 1;
-            cout << "iter_count: " << csbp.iters << endl;
-        }
-        break;
-    case 'e': case 'E':
-        if (p.method == Params::BP)
-        {
-            bp.iters = max(bp.iters - 1, 1);
-            cout << "iter_count: " << bp.iters << endl;
-        }
-        else if (p.method == Params::CSBP)
-        {
-            csbp.iters = max(csbp.iters - 1, 1);
-            cout << "iter_count: " << csbp.iters << endl;
-        }
-        break;
-    case '4':
-        if (p.method == Params::BP)
-        {
-            bp.levels += 1;
-            cout << "level_count: " << bp.levels << endl;
-        }
-        else if (p.method == Params::CSBP)
-        {
-            csbp.levels += 1;
-            cout << "level_count: " << csbp.levels << endl;
-        }
-        break;
-    case 'r': case 'R':
-        if (p.method == Params::BP)
-        {
-            bp.levels = max(bp.levels - 1, 1);
-            cout << "level_count: " << bp.levels << endl;
-        }
-        else if (p.method == Params::CSBP)
-        {
-            csbp.levels = max(csbp.levels - 1, 1);
-            cout << "level_count: " << csbp.levels << endl;
-        }
-        break;
-    }
-}
-
-
+#include <iostream>
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <stdexcept>
+#include "opencv2/gpu/gpu.hpp"
+#include "opencv2/highgui/highgui.hpp"
+
+using namespace cv;
+using namespace std;
+
+bool help_showed = false;
+
+struct Params
+{
+    Params();
+    static Params read(int argc, char** argv);
+
+    string left;
+    string right;
+
+    string method_str() const
+    {
+        switch (method)
+        {
+        case BM: return "BM";
+        case BP: return "BP";
+        case CSBP: return "CSBP";
+        }
+        return "";
+    }
+    enum {BM, BP, CSBP} method;
+    int ndisp; // Max disparity + 1
+};
+
+
+struct App
+{
+    App(const Params& p);
+    void run();
+    void handleKey(char key);
+    void printParams() const;
+
+    void workBegin() { work_begin = getTickCount(); }
+    void workEnd()
+    {
+        int64 d = getTickCount() - work_begin;
+        double f = getTickFrequency();
+        work_fps = f / d;
+    }
+
+    string text() const
+    {
+        stringstream ss;
+        ss << "(" << p.method_str() << ") FPS: " << setiosflags(ios::left)
+            << setprecision(4) << work_fps;
+        return ss.str();
+    }
+private:
+    Params p;
+    bool running;
+
+    Mat left_src, right_src;
+    Mat left, right;
+    gpu::GpuMat d_left, d_right;
+
+    gpu::StereoBM_GPU bm;
+    gpu::StereoBeliefPropagation bp;
+    gpu::StereoConstantSpaceBP csbp;
+
+    int64 work_begin;
+    double work_fps;
+};
+
+static void printHelp()
+{
+    cout << "Usage: stereo_match_gpu\n"
+        << "\t--left <left_view> --right <right_view> # must be rectified\n"
+        << "\t--method <stereo_match_method> # BM | BP | CSBP\n"
+        << "\t--ndisp <number> # number of disparity levels\n";
+    help_showed = true;
+}
+
+int main(int argc, char** argv)
+{
+    try
+    {
+        if (argc < 2)
+        {
+            printHelp();
+            return 1;
+        }
+        Params args = Params::read(argc, argv);
+        if (help_showed)
+            return -1;
+        App app(args);
+        app.run();
+    }
+    catch (const exception& e)
+    {
+        cout << "error: " << e.what() << endl;
+    }
+    return 0;
+}
+
+
+Params::Params()
+{
+    method = BM;
+    ndisp = 64;
+}
+
+
+Params Params::read(int argc, char** argv)
+{
+    Params p;
+
+    for (int i = 1; i < argc; i++)
+    {
+        if (string(argv[i]) == "--left") p.left = argv[++i];
+        else if (string(argv[i]) == "--right") p.right = argv[++i];
+        else if (string(argv[i]) == "--method")
+        {
+            if (string(argv[i + 1]) == "BM") p.method = BM;
+            else if (string(argv[i + 1]) == "BP") p.method = BP;
+            else if (string(argv[i + 1]) == "CSBP") p.method = CSBP;
+            else throw runtime_error("unknown stereo match method: " + string(argv[i + 1]));
+            i++;
+        }
+        else if (string(argv[i]) == "--ndisp") p.ndisp = atoi(argv[++i]);
+        else if (string(argv[i]) == "--help") printHelp();
+        else throw runtime_error("unknown key: " + string(argv[i]));
+    }
+
+    return p;
+}
+
+
+App::App(const Params& params)
+    : p(params), running(false)
+{
+    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
+
+    cout << "stereo_match_gpu sample\n";
+    cout << "\nControls:\n"
+        << "\tesc - exit\n"
+        << "\tp - print current parameters\n"
+        << "\tg - convert source images into gray\n"
+        << "\tm - change stereo match method\n"
+        << "\ts - change Sobel prefiltering flag (for BM only)\n"
+        << "\t1/q - increase/decrease maximum disparity\n"
+        << "\t2/w - increase/decrease window size (for BM only)\n"
+        << "\t3/e - increase/decrease iteration count (for BP and CSBP only)\n"
+        << "\t4/r - increase/decrease level count (for BP and CSBP only)\n";
+}
+
+
+void App::run()
+{
+    // Read both source images first, then validate them (left is reported before right).
+    left_src = imread(p.left);
+    right_src = imread(p.right);
+    if (left_src.empty())
+        throw runtime_error("can't open file \"" + p.left + "\"");
+    if (right_src.empty())
+        throw runtime_error("can't open file \"" + p.right + "\"");
+    // Start from grayscale versions, upload them to the GPU and show them.
+    cvtColor(left_src, left, CV_BGR2GRAY);
+    cvtColor(right_src, right, CV_BGR2GRAY);
+    d_left.upload(left);
+    d_right.upload(right);
+    imshow("left", left);
+    imshow("right", right);
+
+    // All three matchers get the same disparity count.
+    bm.ndisp = bp.ndisp = csbp.ndisp = p.ndisp;
+    // Host and device buffers for the disparity map.
+    Mat disp(left.size(), CV_8U);
+    gpu::GpuMat d_disp(left.size(), CV_8U);
+
+    cout << endl;
+    printParams();
+
+    // Main loop: repeats until handleKey() sets `running` to false.
+    for (running = true; running; )
+    {
+        workBegin();
+        if (p.method == Params::BM)
+        {
+            const bool anyColor = !(d_left.channels() <= 1 && d_right.channels() <= 1);
+            if (anyColor)
+            {
+                // Multi-channel input is on the device: fall back to gray before running BM.
+                cout << "BM doesn't support color images\n";
+                cvtColor(left_src, left, CV_BGR2GRAY);
+                cvtColor(right_src, right, CV_BGR2GRAY);
+                cout << "image_channels: " << left.channels() << endl;
+                d_left.upload(left);
+                d_right.upload(right);
+                imshow("left", left);
+                imshow("right", right);
+            }
+            bm(d_left, d_right, d_disp);
+        }
+        else if (p.method == Params::BP)
+            bp(d_left, d_right, d_disp);
+        else if (p.method == Params::CSBP)
+            csbp(d_left, d_right, d_disp);
+        workEnd();
+
+        // Download the result, draw the status text on it and display it.
+        d_disp.download(disp);
+        putText(disp, text(), Point(5, 25), FONT_HERSHEY_SIMPLEX, 1.0, Scalar::all(255));
+        imshow("disparity", disp);
+        handleKey((char)waitKey(3));
+    }
+}
+
+
+void App::printParams() const
+{
+    // Dumps the current configuration to stdout: the settings shared by all
+    // methods first, then those of the active method only.
+    cout << "--- Parameters ---\n"
+         << "image_size: (" << left.cols << ", " << left.rows << ")\n"
+         << "image_channels: " << left.channels() << endl;
+    cout << "method: " << p.method_str() << endl;
+    cout << "ndisp: " << p.ndisp << endl;
+
+    // Method-specific settings.
+    if (p.method == Params::BM)
+        cout << "win_size: " << bm.winSize << endl
+             << "prefilter_sobel: " << bm.preset << endl;
+    else if (p.method == Params::BP)
+        cout << "iter_count: " << bp.iters << endl
+             << "level_count: " << bp.levels << endl;
+    else if (p.method == Params::CSBP)
+        cout << "iter_count: " << csbp.iters << endl
+             << "level_count: " << csbp.levels << endl;
+
+    // Trailing empty line.
+    cout << endl;
+}
+
+
+void App::handleKey(char key)
+{
+    // Reacts to a single key code passed in from run(). Keys without a case
+    // label below are ignored.
+    // The active method is sampled once; only the 'm' binding changes it.
+    const bool bmActive = (p.method == Params::BM);
+    const bool bpActive = (p.method == Params::BP);
+    const bool csbpActive = (p.method == Params::CSBP);
+
+    switch (key)
+    {
+    case 27: // leave the main loop in run()
+        running = false;
+        break;
+
+    case 'p': case 'P':
+        printParams();
+        break;
+
+    case 'g': case 'G':
+        if (left.channels() == 1 && !bmActive)
+        {
+            // Gray input and a method other than BM: switch to the images as loaded.
+            left = left_src;
+            right = right_src;
+        }
+        else
+        {
+            // Already multi-channel, or BM is active: (re)convert to gray.
+            cvtColor(left_src, left, CV_BGR2GRAY);
+            cvtColor(right_src, right, CV_BGR2GRAY);
+        }
+        d_left.upload(left);
+        d_right.upload(right);
+        cout << "image_channels: " << left.channels() << endl;
+        imshow("left", left);
+        imshow("right", right);
+        break;
+
+    case 'm': case 'M': // cycle BM -> BP -> CSBP -> BM
+        if (bmActive)
+            p.method = Params::BP;
+        else if (bpActive)
+            p.method = Params::CSBP;
+        else if (csbpActive)
+            p.method = Params::BM;
+        cout << "method: " << p.method_str() << endl;
+        break;
+
+    case 's': case 'S': // BM only
+        if (bmActive)
+        {
+            if (bm.preset == gpu::StereoBM_GPU::BASIC_PRESET)
+                bm.preset = gpu::StereoBM_GPU::PREFILTER_XSOBEL;
+            else if (bm.preset == gpu::StereoBM_GPU::PREFILTER_XSOBEL)
+                bm.preset = gpu::StereoBM_GPU::BASIC_PRESET;
+            // Any other preset value is left as it is.
+            cout << "prefilter_sobel: " << bm.preset << endl;
+        }
+        break;
+
+    case '1': // 1 -> 8, otherwise +8
+        if (p.ndisp == 1)
+            p.ndisp = 8;
+        else
+            p.ndisp += 8;
+        cout << "ndisp: " << p.ndisp << endl;
+        // Keep all three matchers in step with the new value.
+        bm.ndisp = bp.ndisp = csbp.ndisp = p.ndisp;
+        break;
+
+    case 'q': case 'Q': // -8, never below 1
+        p.ndisp = max(p.ndisp - 8, 1);
+        cout << "ndisp: " << p.ndisp << endl;
+        // Keep all three matchers in step with the new value.
+        bm.ndisp = bp.ndisp = csbp.ndisp = p.ndisp;
+        break;
+
+    case '2': // BM only: +1, capped at 51
+        if (bmActive)
+        {
+            bm.winSize = min(bm.winSize + 1, 51);
+            cout << "win_size: " << bm.winSize << endl;
+        }
+        break;
+
+    case 'w': case 'W': // BM only: -1, never below 2
+        if (bmActive)
+        {
+            bm.winSize = max(bm.winSize - 1, 2);
+            cout << "win_size: " << bm.winSize << endl;
+        }
+        break;
+
+    case '3': // BP / CSBP only
+        if (bpActive)
+            cout << "iter_count: " << ++bp.iters << endl;
+        else if (csbpActive)
+            cout << "iter_count: " << ++csbp.iters << endl;
+        break;
+
+    case 'e': case 'E': // BP / CSBP only: never below 1
+        if (bpActive)
+        {
+            bp.iters = max(bp.iters - 1, 1);
+            cout << "iter_count: " << bp.iters << endl;
+        }
+        else if (csbpActive)
+        {
+            csbp.iters = max(csbp.iters - 1, 1);
+            cout << "iter_count: " << csbp.iters << endl;
+        }
+        break;
+
+    case '4': // BP / CSBP only
+        if (bpActive)
+            cout << "level_count: " << ++bp.levels << endl;
+        else if (csbpActive)
+            cout << "level_count: " << ++csbp.levels << endl;
+        break;
+
+    case 'r': case 'R': // BP / CSBP only: never below 1
+        if (bpActive)
+        {
+            bp.levels = max(bp.levels - 1, 1);
+            cout << "level_count: " << bp.levels << endl;
+        }
+        else if (csbpActive)
+        {
+            csbp.levels = max(csbp.levels - 1, 1);
+            cout << "level_count: " << csbp.levels << endl;
+        }
+        break;
+    }
+}
+
+
--- a/samples/gpu/stereo_multi.cpp
+++ b/samples/gpu/stereo_multi.cpp
@@ -1,152 +1,152 @@
-/* This sample demonstrates working on one piece of data using two GPUs.
-   It splits input into two parts and processes them separately on different
-   GPUs. */
-
-// Disable some warnings which are caused with CUDA headers
-#if defined(_MSC_VER)
-#pragma warning(disable: 4201 4408 4100)
-#endif
-
-#include <iostream>
-#include "cvconfig.h"
-#include "opencv2/core/core.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/gpu/gpu.hpp"
-
-#if !defined(HAVE_CUDA) || !defined(HAVE_TBB)
-
-int main()
-{
-#if !defined(HAVE_CUDA)
-    std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true).\n";
-#endif
-
-#if !defined(HAVE_TBB)
-    std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
-#endif
-
-    return 0;
-}
-
-#else
-
-#include "opencv2/core/internal.hpp" // For TBB wrappers
-
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-
-struct Worker { void operator()(int device_id) const; };
-
-// GPUs data
-GpuMat d_left[2];
-GpuMat d_right[2];
-StereoBM_GPU* bm[2];
-GpuMat d_result[2];
-
-// CPU result
-Mat result;
-
-void printHelp()
-{
-    std::cout << "Usage: stereo_multi_gpu --left <image> --right <image>\n";
-}
-
-int main(int argc, char** argv)
-{
-    if (argc < 5)
-    {
-        printHelp();
-        return -1;
-    }
-
-    int num_devices = getCudaEnabledDeviceCount();
-    if (num_devices < 2)
-    {
-        std::cout << "Two or more GPUs are required\n";
-        return -1;
-    }
-    for (int i = 0; i < num_devices; ++i)
-    {
-        cv::gpu::printShortCudaDeviceInfo(i);
-
-        DeviceInfo dev_info(i);
-        if (!dev_info.isCompatible())
-        {
-            std::cout << "GPU module isn't built for GPU #" << i << " ("
-                 << dev_info.name() << ", CC " << dev_info.majorVersion()
-                 << dev_info.minorVersion() << "\n";
-            return -1;
-        }
-    }
-
-    // Load input data
-    Mat left, right;
-    for (int i = 1; i < argc; ++i)
-    {
-        if (string(argv[i]) == "--left")
-        {
-            left = imread(argv[++i], CV_LOAD_IMAGE_GRAYSCALE);
-            CV_Assert(!left.empty());
-        }
-        else if (string(argv[i]) == "--right")
-        {
-            right = imread(argv[++i], CV_LOAD_IMAGE_GRAYSCALE);
-            CV_Assert(!right.empty());
-        }
-        else if (string(argv[i]) == "--help")
-        {
-            printHelp();
-            return -1;
-        }
-    }
-
-    // Split source images for processing on the GPU #0
-    setDevice(0);
-    d_left[0].upload(left.rowRange(0, left.rows / 2));
-    d_right[0].upload(right.rowRange(0, right.rows / 2));
-    bm[0] = new StereoBM_GPU();
-
-    // Split source images for processing on the GPU #1
-    setDevice(1);
-    d_left[1].upload(left.rowRange(left.rows / 2, left.rows));
-    d_right[1].upload(right.rowRange(right.rows / 2, right.rows));
-    bm[1] = new StereoBM_GPU();
-
-    // Execute calculation in two threads using two GPUs
-    int devices[] = {0, 1};
-    parallel_do(devices, devices + 2, Worker());
-
-    // Release the first GPU resources
-    setDevice(0);
-    imshow("GPU #0 result", Mat(d_result[0]));
-    d_left[0].release();
-    d_right[0].release();
-    d_result[0].release();
-    delete bm[0];
-
-    // Release the second GPU resources
-    setDevice(1);
-    imshow("GPU #1 result", Mat(d_result[1]));
-    d_left[1].release();
-    d_right[1].release();
-    d_result[1].release();
-    delete bm[1];
-
-    waitKey();
-    return 0;
-}
-
-
-void Worker::operator()(int device_id) const
-{
-    setDevice(device_id);
-
-    bm[device_id]->operator()(d_left[device_id], d_right[device_id],
-                              d_result[device_id]);
-
-    std::cout << "GPU #" << device_id << " (" << DeviceInfo().name()
-        << "): finished\n";
-}
-
-#endif
+/* This sample demonstrates working on one piece of data using two GPUs.
+   It splits input into two parts and processes them separately on different
+   GPUs. */
+
+// Disable some warnings which are caused with CUDA headers
+#if defined(_MSC_VER)
+#pragma warning(disable: 4201 4408 4100)
+#endif
+
+#include <iostream>
+#include "cvconfig.h"
+#include "opencv2/core/core.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/gpu/gpu.hpp"
+
+#if !defined(HAVE_CUDA) || !defined(HAVE_TBB)
+
+int main()
+{
+    // Fallback entry point: only reports which build option is missing.
+#ifndef HAVE_CUDA
+    std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true).\n";
+#endif
+#ifndef HAVE_TBB
+    std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
+#endif
+    // The exit status is 0 in either case.
+    return 0;
+}
+
+#else
+
+#include "opencv2/core/internal.hpp" // For TBB wrappers
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+struct Worker { void operator()(int device_id) const; }; // functor handed to parallel_do() in main(), called with a device id
+
+// GPUs data: every array is indexed by device id (0 or 1)
+GpuMat d_left[2]; // rows of the left image uploaded for each device
+GpuMat d_right[2]; // rows of the right image uploaded for each device
+StereoBM_GPU* bm[2]; // allocated with new and deleted in main()
+GpuMat d_result[2]; // output written by Worker::operator()
+
+// CPU result (not referenced anywhere else in this file)
+Mat result;
+
+void printHelp() // writes the one-line command-line synopsis to stdout; called from main()
+{
+    cout << "Usage: stereo_multi_gpu --left <image> --right <image>\n";
+}
+
+int main(int argc, char** argv)
+{
+    // Entry point: checks arguments and devices, runs one matcher per GPU, shows both halves.
+    if (argc < 5)
+    {
+        printHelp();
+        return -1;
+    }
+
+    int num_devices = getCudaEnabledDeviceCount();
+    if (num_devices < 2)
+    {
+        std::cout << "Two or more GPUs are required\n";
+        return -1;
+    }
+    for (int i = 0; i < num_devices; ++i)
+    {
+        cv::gpu::printShortCudaDeviceInfo(i);
+        DeviceInfo dev_info(i);
+        if (!dev_info.isCompatible())
+        {
+            std::cout << "GPU module isn't built for GPU #" << i << " (" << dev_info.name()
+                 << ", CC " << dev_info.majorVersion() << '.' << dev_info.minorVersion() << ")\n";
+            return -1;
+        }
+    }
+
+    // Load input data; a flag counts only when a value follows it, so argv[argc] is never read
+    Mat left, right;
+    for (int i = 1; i < argc; ++i)
+    {
+        if (string(argv[i]) == "--left" && i + 1 < argc)
+        {
+            left = imread(argv[++i], CV_LOAD_IMAGE_GRAYSCALE);
+            CV_Assert(!left.empty());
+        }
+        else if (string(argv[i]) == "--right" && i + 1 < argc)
+        {
+            right = imread(argv[++i], CV_LOAD_IMAGE_GRAYSCALE);
+            CV_Assert(!right.empty());
+        }
+        else if (string(argv[i]) == "--help")
+        {
+            printHelp();
+            return -1;
+        }
+    }
+    // Both images are required; rowRange() below must not be given an empty Mat
+    if (left.empty() || right.empty())
+    {
+        printHelp();
+        return -1;
+    }
+
+    // Split source images for processing on the GPU #0
+    setDevice(0);
+    d_left[0].upload(left.rowRange(0, left.rows / 2));
+    d_right[0].upload(right.rowRange(0, right.rows / 2));
+    bm[0] = new StereoBM_GPU();
+
+    // Split source images for processing on the GPU #1
+    setDevice(1);
+    d_left[1].upload(left.rowRange(left.rows / 2, left.rows));
+    d_right[1].upload(right.rowRange(right.rows / 2, right.rows));
+    bm[1] = new StereoBM_GPU();
+
+    // Execute calculation in two threads using two GPUs
+    int devices[] = {0, 1};
+    parallel_do(devices, devices + 2, Worker());
+
+    // Show each half and release its resources after selecting the matching device
+    const char* const titles[] = {"GPU #0 result", "GPU #1 result"};
+    for (int dev = 0; dev < 2; ++dev)
+    {
+        setDevice(dev);
+        imshow(titles[dev], Mat(d_result[dev]));
+        d_left[dev].release();
+        d_right[dev].release();
+        d_result[dev].release();
+        delete bm[dev];
+    }
+    waitKey();
+    return 0;
+}
+
+
+void Worker::operator()(int device_id) const
+{
+    // Select the requested device before using the buffers stored under its index.
+    setDevice(device_id);
+    // Run this device's matcher on its left/right inputs; the output goes to d_result[device_id].
+    StereoBM_GPU& matcher = *bm[device_id];
+    matcher(d_left[device_id], d_right[device_id], d_result[device_id]);
+    // DeviceInfo() is default-constructed -- presumably it describes the current device; confirm.
+    std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): finished\n";
+}
+
+#endif
--- a/samples/gpu/surf_keypoint_matcher.cpp
+++ b/samples/gpu/surf_keypoint_matcher.cpp
@@ -1,83 +1,83 @@
-#include <iostream>
-
-#include "opencv2/core/core.hpp"
-#include "opencv2/features2d/features2d.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/gpu/gpu.hpp"
-
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-
-static void help()
-{
-    cout << "\nThis program demonstrates using SURF_GPU features detector, descriptor extractor and BruteForceMatcher_GPU" << endl;
-    cout << "\nUsage:\n\tmatcher_simple_gpu --left <image1> --right <image2>" << endl;
-}
-
-int main(int argc, char* argv[])
-{
-    if (argc != 5)
-    {
-        help();
-        return -1;
-    }
-
-    GpuMat img1, img2;
-    for (int i = 1; i < argc; ++i)
-    {
-        if (string(argv[i]) == "--left")
-        {
-            img1.upload(imread(argv[++i], CV_LOAD_IMAGE_GRAYSCALE));
-            CV_Assert(!img1.empty());
-        }
-        else if (string(argv[i]) == "--right")
-        {
-            img2.upload(imread(argv[++i], CV_LOAD_IMAGE_GRAYSCALE));
-            CV_Assert(!img2.empty());
-        }
-        else if (string(argv[i]) == "--help")
-        {
-            help();
-            return -1;
-        }
-    }
-
-    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
-
-    SURF_GPU surf;
-
-    // detecting keypoints & computing descriptors
-    GpuMat keypoints1GPU, keypoints2GPU;
-    GpuMat descriptors1GPU, descriptors2GPU;
-    surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
-    surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
-
-    cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
-    cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
-
-    // matching descriptors
-    BFMatcher_GPU matcher(NORM_L2);
-    GpuMat trainIdx, distance;
-    matcher.matchSingle(descriptors1GPU, descriptors2GPU, trainIdx, distance);
-
-    // downloading results
-    vector<KeyPoint> keypoints1, keypoints2;
-    vector<float> descriptors1, descriptors2;
-    vector<DMatch> matches;
-    surf.downloadKeypoints(keypoints1GPU, keypoints1);
-    surf.downloadKeypoints(keypoints2GPU, keypoints2);
-    surf.downloadDescriptors(descriptors1GPU, descriptors1);
-    surf.downloadDescriptors(descriptors2GPU, descriptors2);
-    BFMatcher_GPU::matchDownload(trainIdx, distance, matches);
-
-    // drawing the results
-    Mat img_matches;
-    drawMatches(Mat(img1), keypoints1, Mat(img2), keypoints2, matches, img_matches);
-
-    namedWindow("matches", 0);
-    imshow("matches", img_matches);
-    waitKey(0);
-
-    return 0;
-}
+#include <iostream>
+
+#include "opencv2/core/core.hpp"
+#include "opencv2/features2d/features2d.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/gpu/gpu.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+static void help() // prints a short description of the sample followed by its usage line
+{
+    cout << "\nThis program demonstrates using SURF_GPU features detector, descriptor extractor and BruteForceMatcher_GPU" << endl
+         << "\nUsage:\n\tmatcher_simple_gpu --left <image1> --right <image2>" << endl;
+}
+
+int main(int argc, char* argv[])
+{
+    // Expects exactly four arguments: --left <image1> --right <image2>
+    if (argc != 5)
+    {
+        help();
+        return -1;
+    }
+
+    // A flag is honoured only when a value follows it, so argv[argc] is never read.
+    GpuMat img1, img2;
+    for (int i = 1; i < argc; ++i)
+    {
+        if (string(argv[i]) == "--left" && i + 1 < argc)
+        {
+            img1.upload(imread(argv[++i], CV_LOAD_IMAGE_GRAYSCALE));
+            CV_Assert(!img1.empty());
+        }
+        else if (string(argv[i]) == "--right" && i + 1 < argc)
+        {
+            img2.upload(imread(argv[++i], CV_LOAD_IMAGE_GRAYSCALE));
+            CV_Assert(!img2.empty());
+        }
+        else if (string(argv[i]) == "--help")
+        {
+            help();
+            return -1;
+        }
+    }
+    // Both images must have been supplied (e.g. "--left a --left b" is rejected).
+    if (img1.empty() || img2.empty())
+    {
+        help();
+        return -1;
+    }
+
+    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
+    SURF_GPU surf;
+    // detecting keypoints & computing descriptors
+    GpuMat keypoints1GPU, keypoints2GPU;
+    GpuMat descriptors1GPU, descriptors2GPU;
+    surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
+    surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
+    cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
+    cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
+    // matching descriptors
+    BFMatcher_GPU matcher(NORM_L2);
+    GpuMat trainIdx, distance;
+    matcher.matchSingle(descriptors1GPU, descriptors2GPU, trainIdx, distance);
+    // downloading results
+    vector<KeyPoint> keypoints1, keypoints2;
+    vector<float> descriptors1, descriptors2;
+    vector<DMatch> matches;
+    surf.downloadKeypoints(keypoints1GPU, keypoints1);
+    surf.downloadKeypoints(keypoints2GPU, keypoints2);
+    surf.downloadDescriptors(descriptors1GPU, descriptors1);
+    surf.downloadDescriptors(descriptors2GPU, descriptors2);
+    BFMatcher_GPU::matchDownload(trainIdx, distance, matches);
+    // drawing the results
+    Mat img_matches;
+    drawMatches(Mat(img1), keypoints1, Mat(img2), keypoints2, matches, img_matches);
+    namedWindow("matches", 0);
+    imshow("matches", img_matches);
+    waitKey(0);
+    return 0;
+}
--- a/samples/gpu/video_reader.cpp
+++ b/samples/gpu/video_reader.cpp
@@ -1,71 +1,71 @@
-#include <iostream>
-#include <string>
-#include <vector>
-#include <algorithm>
-#include <numeric>
-
-#include <opencv2/core/core.hpp>
-#include <opencv2/core/opengl_interop.hpp>
-#include <opencv2/gpu/gpu.hpp>
-#include <opencv2/highgui/highgui.hpp>
-#include <opencv2/contrib/contrib.hpp>
-
-int main(int argc, const char* argv[])
-{
-    if (argc != 2)
-        return -1;
-
-    const std::string fname(argv[1]);
-
-    cv::namedWindow("CPU", cv::WINDOW_NORMAL);
-    cv::namedWindow("GPU", cv::WINDOW_OPENGL);
-    cv::gpu::setGlDevice();
-
-    cv::Mat frame;
-    cv::VideoCapture reader(fname);
-
-    cv::gpu::GpuMat d_frame;
-    cv::gpu::VideoReader_GPU d_reader(fname);
-    d_reader.dumpFormat(std::cout);
-
-    cv::TickMeter tm;
-    std::vector<double> cpu_times;
-    std::vector<double> gpu_times;
-
-    for (;;)
-    {
-        tm.reset(); tm.start();
-        if (!reader.read(frame))
-            break;
-        tm.stop();
-        cpu_times.push_back(tm.getTimeMilli());
-
-        tm.reset(); tm.start();
-        if (!d_reader.read(d_frame))
-            break;
-        tm.stop();
-        gpu_times.push_back(tm.getTimeMilli());
-
-        cv::imshow("CPU", frame);
-        cv::imshow("GPU", d_frame);
-
-        if (cv::waitKey(3) > 0)
-            break;
-    }
-
-    if (!cpu_times.empty() && !gpu_times.empty())
-    {
-        std::cout << std::endl << "Results:" << std::endl;
-
-        std::sort(cpu_times.begin(), cpu_times.end());
-        std::sort(gpu_times.begin(), gpu_times.end());
-
-        double cpu_avg = std::accumulate(cpu_times.begin(), cpu_times.end(), 0.0) / cpu_times.size();
-        double gpu_avg = std::accumulate(gpu_times.begin(), gpu_times.end(), 0.0) / gpu_times.size();
-
-        std::cout << "CPU : Avg : " << cpu_avg << " ms FPS : " << 1000.0 / cpu_avg << std::endl;
-        std::cout << "GPU : Avg : " << gpu_avg << " ms FPS : " << 1000.0 / gpu_avg << std::endl;
-    }
-
-    return 0;
-}
+#include <iostream>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <numeric>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/core/opengl_interop.hpp>
+#include <opencv2/gpu/gpu.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/contrib/contrib.hpp>
+
+int main(int argc, const char* argv[])
+{
+    // Exactly one argument is expected: the video file to decode.
+    if (argc != 2)
+    {
+        std::cerr << "Usage : video_reader <input video file>" << std::endl;
+        return -1;
+    }
+    const std::string fname(argv[1]);
+
+    cv::namedWindow("CPU", cv::WINDOW_NORMAL);
+    cv::namedWindow("GPU", cv::WINDOW_OPENGL);
+    cv::gpu::setGlDevice();
+
+    // The same file is opened twice: through VideoCapture and through VideoReader_GPU.
+    cv::Mat frame;
+    cv::VideoCapture reader(fname);
+    cv::gpu::GpuMat d_frame;
+    cv::gpu::VideoReader_GPU d_reader(fname);
+    d_reader.dumpFormat(std::cout);
+
+    cv::TickMeter tm;
+    std::vector<double> cpu_times;
+    std::vector<double> gpu_times;
+
+    // Time each read() on its own; stop at the first failed read or when waitKey() returns > 0.
+    for (;;)
+    {
+        tm.reset(); tm.start();
+        if (!reader.read(frame))
+            break;
+        tm.stop();
+        cpu_times.push_back(tm.getTimeMilli());
+
+        tm.reset(); tm.start();
+        if (!d_reader.read(d_frame))
+            break;
+        tm.stop();
+        gpu_times.push_back(tm.getTimeMilli());
+
+        cv::imshow("CPU", frame);
+        cv::imshow("GPU", d_frame);
+        if (cv::waitKey(3) > 0)
+            break;
+    }
+
+    // The averages divide by the sample counts, so both vectors must be non-empty.
+    if (!cpu_times.empty() && !gpu_times.empty())
+    {
+        std::cout << std::endl << "Results:" << std::endl;
+        std::sort(cpu_times.begin(), cpu_times.end());
+        std::sort(gpu_times.begin(), gpu_times.end());
+        double cpu_avg = std::accumulate(cpu_times.begin(), cpu_times.end(), 0.0) / cpu_times.size();
+        double gpu_avg = std::accumulate(gpu_times.begin(), gpu_times.end(), 0.0) / gpu_times.size();
+        std::cout << "CPU : Avg : " << cpu_avg << " ms FPS : " << 1000.0 / cpu_avg << std::endl;
+        std::cout << "GPU : Avg : " << gpu_avg << " ms FPS : " << 1000.0 / gpu_avg << std::endl;
+    }
+    return 0;
+}
--- a/samples/gpu/video_writer.cpp
+++ b/samples/gpu/video_writer.cpp
@@ -1,96 +1,96 @@
-#include <iostream>
-#include <vector>
-#include <numeric>
-
-#include "opencv2/core/core.hpp"
-#include "opencv2/gpu/gpu.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/contrib/contrib.hpp"
-
-int main(int argc, const char* argv[])
-{
-    if (argc != 2)
-    {
-        std::cerr << "Usage : video_writer <input video file>" << std::endl;
-        return -1;
-    }
-
-    const double FPS = 25.0;
-
-    cv::VideoCapture reader(argv[1]);
-
-    if (!reader.isOpened())
-    {
-        std::cerr << "Can't open input video file" << std::endl;
-        return -1;
-    }
-
-    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
-
-    cv::VideoWriter writer;
-    cv::gpu::VideoWriter_GPU d_writer;
-
-    cv::Mat frame;
-    cv::gpu::GpuMat d_frame;
-
-    std::vector<double> cpu_times;
-    std::vector<double> gpu_times;
-    cv::TickMeter tm;
-
-    for (int i = 1;; ++i)
-    {
-        std::cout << "Read " << i << " frame" << std::endl;
-
-        reader >> frame;
-
-        if (frame.empty())
-        {
-            std::cout << "Stop" << std::endl;
-            break;
-        }
-
-        if (!writer.isOpened())
-        {
-            std::cout << "Frame Size : " << frame.cols << "x" << frame.rows << std::endl;
-
-            std::cout << "Open CPU Writer" << std::endl;
-
-            if (!writer.open("output_cpu.avi", CV_FOURCC('X', 'V', 'I', 'D'), FPS, frame.size()))
-                return -1;
-        }
-
-        if (!d_writer.isOpened())
-        {
-            std::cout << "Open GPU Writer" << std::endl;
-
-            d_writer.open("output_gpu.avi", frame.size(), FPS);
-        }
-
-        d_frame.upload(frame);
-
-        std::cout << "Write " << i << " frame" << std::endl;
-
-        tm.reset(); tm.start();
-        writer.write(frame);
-        tm.stop();
-        cpu_times.push_back(tm.getTimeMilli());
-
-        tm.reset(); tm.start();
-        d_writer.write(d_frame);
-        tm.stop();
-        gpu_times.push_back(tm.getTimeMilli());
-    }
-
-    std::cout << std::endl << "Results:" << std::endl;
-
-    std::sort(cpu_times.begin(), cpu_times.end());
-    std::sort(gpu_times.begin(), gpu_times.end());
-
-    double cpu_avg = std::accumulate(cpu_times.begin(), cpu_times.end(), 0.0) / cpu_times.size();
-    double gpu_avg = std::accumulate(gpu_times.begin(), gpu_times.end(), 0.0) / gpu_times.size();
-
-    std::cout << "CPU [XVID] : Avg : " << cpu_avg << " ms FPS : " << 1000.0 / cpu_avg << std::endl;
-    std::cout << "GPU [H264] : Avg : " << gpu_avg << " ms FPS : " << 1000.0 / gpu_avg << std::endl;
-
-    return 0;
-}
+#include <iostream>
+#include <vector>
+#include <numeric>
+
+#include "opencv2/core/core.hpp"
+#include "opencv2/gpu/gpu.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/contrib/contrib.hpp"
+
+int main(int argc, const char* argv[])
+{
+    // Writes every input frame with both the CPU and the GPU writer and reports average write times.
+    if (argc != 2)
+    {
+        std::cerr << "Usage : video_writer <input video file>" << std::endl;
+        return -1;
+    }
+
+    const double FPS = 25.0;
+
+    cv::VideoCapture reader(argv[1]);
+    if (!reader.isOpened())
+    {
+        std::cerr << "Can't open input video file" << std::endl;
+        return -1;
+    }
+
+    cv::gpu::printShortCudaDeviceInfo(cv::gpu::getDevice());
+
+    cv::VideoWriter writer;
+    cv::gpu::VideoWriter_GPU d_writer;
+
+    cv::Mat frame;
+    cv::gpu::GpuMat d_frame;
+
+    std::vector<double> cpu_times;
+    std::vector<double> gpu_times;
+    cv::TickMeter tm;
+
+    for (int i = 1;; ++i)
+    {
+        std::cout << "Read " << i << " frame" << std::endl;
+        reader >> frame;
+
+        if (frame.empty())
+        {
+            std::cout << "Stop" << std::endl;
+            break;
+        }
+
+        // Each writer is opened on first use, with the size of the current frame.
+        if (!writer.isOpened())
+        {
+            std::cout << "Frame Size : " << frame.cols << "x" << frame.rows << std::endl;
+            std::cout << "Open CPU Writer" << std::endl;
+            if (!writer.open("output_cpu.avi", CV_FOURCC('X', 'V', 'I', 'D'), FPS, frame.size()))
+                return -1;
+        }
+        if (!d_writer.isOpened())
+        {
+            std::cout << "Open GPU Writer" << std::endl;
+            d_writer.open("output_gpu.avi", frame.size(), FPS);
+        }
+
+        d_frame.upload(frame);
+        std::cout << "Write " << i << " frame" << std::endl;
+
+        tm.reset(); tm.start();
+        writer.write(frame);
+        tm.stop();
+        cpu_times.push_back(tm.getTimeMilli());
+
+        tm.reset(); tm.start();
+        d_writer.write(d_frame);
+        tm.stop();
+        gpu_times.push_back(tm.getTimeMilli());
+    }
+
+    // Nothing was timed (the very first read produced no frame): avoid dividing by zero below.
+    if (cpu_times.empty() || gpu_times.empty())
+    {
+        std::cout << "No frames were processed" << std::endl;
+        return 0;
+    }
+
+    std::cout << std::endl << "Results:" << std::endl;
+
+    // A mean does not need sorted input, so the samples are summed in recorded order.
+    double cpu_avg = std::accumulate(cpu_times.begin(), cpu_times.end(), 0.0) / cpu_times.size();
+    double gpu_avg = std::accumulate(gpu_times.begin(), gpu_times.end(), 0.0) / gpu_times.size();
+
+    std::cout << "CPU [XVID] : Avg : " << cpu_avg << " ms FPS : " << 1000.0 / cpu_avg << std::endl;
+    std::cout << "GPU [H264] : Avg : " << gpu_avg << " ms FPS : " << 1000.0 / gpu_avg << std::endl;
+
+    return 0;
+}