Replaced parallel_do with parallel_for_ in the driver_api_multi GPU sample.

2015-01-05 13:48:54 +03:00
parent 091c7a3821
commit 72063bf136
1 changed files with 70 additions and 82 deletions
--- a/samples/gpu/driver_api_multi.cpp
+++ b/samples/gpu/driver_api_multi.cpp
@@ -7,56 +7,101 @@
 #endif
 #include <iostream>
 #include "cvconfig.h"
 #include "opencv2/core/core.hpp"
 #include "opencv2/gpu/gpu.hpp"
-#if !defined(HAVE_CUDA) || !defined(HAVE_TBB) || defined(__arm__)
+#if defined(__arm__)
 // Fallback entry point used when the sample cannot be built for this
 // configuration: prints one line per unmet requirement and exits with 0.
 int main()
 {
 #ifndef HAVE_CUDA
    // Built without CUDA support.
    std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true).\n";
 #endif

 #ifndef HAVE_TBB
    // Built without TBB support.
    std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
 #endif

 #ifdef __arm__
    // ARM target.
    std::cout << "Unsupported for ARM CUDA library." << std::endl;
 #endif

    return 0;
 }
 #else
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include "opencv2/core/internal.hpp" // For TBB wrappers
 using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 // Stateless functor invoked once per device id; the call operator is
 // defined out of line further down in this file.
 struct Worker
 {
    void operator()(int device_id) const;
 };
 void destroyContexts();
 #define safeCall(expr) safeCall_(expr, #expr, __FILE__, __LINE__)
 // Checks the status returned by a CUDA driver API call.
 //
 //   code - status value returned by the call
 //   expr - text of the checked expression (stringified by the safeCall macro)
 //   file - source file of the call site
 //   line - source line of the call site
 //
 // On CUDA_SUCCESS this returns immediately. On any other value it prints a
 // diagnostic to std::cout, calls destroyContexts() and terminates the
 // process with exit(-1).
 inline void safeCall_(int code, const char* expr, const char* file, int line)
 {
    if (code == CUDA_SUCCESS)
        return;

    std::cout << "CUDA driver API error: code " << code << ", expr " << expr
              << ", file " << file << ", line " << line << endl;

    // Release the contexts before leaving the process.
    destroyContexts();
    exit(-1);
 }
-// Each GPU is associated with its own context
+struct Worker: public ParallelLoopBody
-CUcontext contexts[2];
+{
    Worker(int num_devices)
    {
        count = num_devices;
        contexts = new contexts CUcontext[num_devices];
        for (int device_id = 0; i < num_devices; device_id++)
        {
            CUdevice device;
            safeCall(cuDeviceGet(&device, device_id));
            safeCall(cuCtxCreate(&contexts[device_id], 0, device));
        }
    }
    virtual void operator() (const Range& range) const
    {
        for (int device_id = range.start; device_id != range.end; ++device_id)
        {
            // Set the proper context
            safeCall(cuCtxPushCurrent(contexts[device_id]));
            Mat src(1000, 1000, CV_32F);
            Mat dst;
            RNG rng(0);
            rng.fill(src, RNG::UNIFORM, 0, 1);
            // CPU works
            transpose(src, dst);
            // GPU works
            GpuMat d_src(src);
            GpuMat d_dst;
            transpose(d_src, d_dst);
            // Check results
            bool passed = norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
            std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): "
            << (passed ? "passed" : "FAILED") << endl;
            // Deallocate data here, otherwise deallocation will be performed
            // after context is extracted from the stack
            d_src.release();
            d_dst.release();
            CUcontext prev_context;
            safeCall(cuCtxPopCurrent(&prev_context));
        }
    }
    ~Worker()
    {
        if ((contexts != NULL) && count != 0)
        {
            for (int device_id = 0; i < num_devices; device_id++)
            {
                safeCall(cuCtxDestroy(contexts[device_id]));
            }
            delete[] contexts;
        }
    }
    CUcontext* contexts;
    int count;
 };
 int main()
 {
@@ -84,67 +129,10 @@ int main()
    // Init CUDA Driver API
    safeCall(cuInit(0));
-    // Create context for GPU #0
+    // Execute calculation
-    CUdevice device;
+    parallel_for_(cv::Range(0, num_devices, Worker(num_devices));
    safeCall(cuDeviceGet(&device, 0));
    safeCall(cuCtxCreate(&contexts[0], 0, device));
    CUcontext prev_context;
    safeCall(cuCtxPopCurrent(&prev_context));
    // Create context for GPU #1
    safeCall(cuDeviceGet(&device, 1));
    safeCall(cuCtxCreate(&contexts[1], 0, device));
    safeCall(cuCtxPopCurrent(&prev_context));
    // Execute calculation in two threads using two GPUs
    int devices[] = {0, 1};
    parallel_do(devices, devices + 2, Worker());
    destroyContexts();
    return 0;
 }
 // Runs the transpose self-check for one device: transposes the same random
 // matrix on the CPU and on the GPU, compares the results and prints
 // "passed" or "FAILED" for that device to std::cout.
 //
 //   device_id - index into the global contexts array
 void Worker::operator()(int device_id) const
 {
    // Make the context of this device current on the calling thread.
    safeCall(cuCtxPushCurrent(contexts[device_id]));

    // Input: 1000x1000 single-channel float matrix filled from an RNG
    // seeded with 0 (uniform distribution, bounds 0 and 1).
    Mat host_in(1000, 1000, CV_32F);
    RNG generator(0);
    generator.fill(host_in, RNG::UNIFORM, 0, 1);

    // Reference result on the CPU.
    Mat host_out;
    transpose(host_in, host_out);

    // Same operation on the GPU.
    GpuMat dev_in(host_in);
    GpuMat dev_out;
    transpose(dev_in, dev_out);

    // The check passes when the largest absolute difference is below 1e-3.
    const double max_abs_diff = norm(host_out - Mat(dev_out), NORM_INF);
    const bool passed = max_abs_diff < 1e-3;
    std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): "
        << (passed ? "passed" : "FAILED") << endl;

    // Free the device buffers while the context is still current; otherwise
    // they would be released only after the context has been popped.
    dev_in.release();
    dev_out.release();

    CUcontext prev_context;
    safeCall(cuCtxPopCurrent(&prev_context));
 }
 // Destroys the two CUDA driver contexts held in contexts[0] and contexts[1].
 // Both cuCtxDestroy calls are checked through safeCall.
 // NOTE(review): safeCall_ itself calls destroyContexts() when a call fails,
 // so a failing cuCtxDestroy here re-enters this function - confirm that this
 // cannot loop.
 void destroyContexts()
 {
    safeCall(cuCtxDestroy(contexts[0]));
    safeCall(cuCtxDestroy(contexts[1]));
 }
 #endif