501 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			501 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
// This sample demonstrates working on one piece of data using two GPUs.
// It splits input into two parts and processes them separately on different GPUs.
 | |
| 
 | |
| #ifdef WIN32
 | |
|     #define NOMINMAX
 | |
|     #include <windows.h>
 | |
| #else
 | |
|     #include <pthread.h>
 | |
|     #include <unistd.h>
 | |
| #endif
 | |
| 
 | |
| #include <iostream>
 | |
| #include <iomanip>
 | |
| 
 | |
| #include "opencv2/core.hpp"
 | |
| #include "opencv2/highgui.hpp"
 | |
| #include "opencv2/imgproc.hpp"
 | |
| #include "opencv2/cudastereo.hpp"
 | |
| 
 | |
| #include "tick_meter.hpp"
 | |
| 
 | |
| using namespace std;
 | |
| using namespace cv;
 | |
| using namespace cv::cuda;
 | |
| 
 | |
| ///////////////////////////////////////////////////////////
 | |
| // Thread
 | |
| // OS-specific wrappers for multi-threading
 | |
| 
 | |
| #ifdef WIN32
 | |
| class Thread
 | |
| {
 | |
|     struct UserData
 | |
|     {
 | |
|         void (*func)(void* userData);
 | |
|         void* param;
 | |
|     };
 | |
| 
 | |
|     static DWORD WINAPI WinThreadFunction(LPVOID lpParam)
 | |
|     {
 | |
|         UserData* userData = static_cast<UserData*>(lpParam);
 | |
| 
 | |
|         userData->func(userData->param);
 | |
| 
 | |
|         return 0;
 | |
|     }
 | |
| 
 | |
|     UserData userData_;
 | |
|     HANDLE thread_;
 | |
|     DWORD threadId_;
 | |
| 
 | |
| public:
 | |
|     Thread(void (*func)(void* userData), void* userData)
 | |
|     {
 | |
|         userData_.func = func;
 | |
|         userData_.param = userData;
 | |
| 
 | |
|         thread_ = CreateThread(
 | |
|             NULL,                   // default security attributes
 | |
|             0,                      // use default stack size
 | |
|             WinThreadFunction,      // thread function name
 | |
|             &userData_,             // argument to thread function
 | |
|             0,                      // use default creation flags
 | |
|             &threadId_);            // returns the thread identifier
 | |
|     }
 | |
| 
 | |
|     ~Thread()
 | |
|     {
 | |
|         CloseHandle(thread_);
 | |
|     }
 | |
| 
 | |
|     void wait()
 | |
|     {
 | |
|         WaitForSingleObject(thread_, INFINITE);
 | |
|     }
 | |
| };
 | |
| #else
 | |
// Minimal pthread wrapper: runs func(userData) once on a new thread.
//
// The thread reads its arguments from userData_, which lives inside this
// object, so the object must outlive the thread. The destructor therefore
// joins the thread if wait() has not been called yet.
class Thread
{
    struct UserData
    {
        void (*func)(void* userData);
        void* param;
    };

    // Trampoline with the signature pthread_create expects.
    static void* PThreadFunction(void* lpParam)
    {
        UserData* userData = static_cast<UserData*>(lpParam);

        userData->func(userData->param);

        return 0;
    }

    pthread_t thread_;
    UserData userData_;
    bool joinable_;     // true while thread_ refers to a started, not yet joined thread

    // Not copyable: two objects must never join the same pthread_t, and the
    // thread keeps pointing at the original object's userData_.
    Thread(const Thread&);
    Thread& operator=(const Thread&);

public:
    // Starts func(userData) on a new thread. If the thread cannot be created,
    // func(userData) is executed synchronously on the calling thread so that
    // the work is still done by the time wait() returns.
    Thread(void (*func)(void* userData), void* userData) : joinable_(false)
    {
        userData_.func = func;
        userData_.param = userData;

        // pthread_create returns 0 on success and an error number otherwise.
        if (pthread_create(&thread_, NULL, PThreadFunction, &userData_) == 0)
            joinable_ = true;
        else
            func(userData);
    }

    // Joins the thread if nobody has waited for it yet. The thread is never
    // detached, because it would then outlive userData_.
    ~Thread()
    {
        wait();
    }

    // Blocks until the thread function has returned. Safe to call repeatedly:
    // the thread is joined only once.
    void wait()
    {
        if (joinable_)
        {
            pthread_join(thread_, NULL);
            joinable_ = false;
        }
    }
};
 | |
| #endif
 | |
| 
 | |
| ///////////////////////////////////////////////////////////
 | |
| // StereoSingleGpu
 | |
| // Run Stereo algorithm on single GPU
 | |
| 
 | |
// Runs the cuda::StereoBM algorithm on one GPU, selected by device index.
// Every member function makes that device current before touching device data.
class StereoSingleGpu
{
public:
    // Makes `deviceId` the current CUDA device and creates the algorithm object.
    explicit StereoSingleGpu(int deviceId = 0);
    // Makes the stored device current and releases the device buffers and the algorithm.
    ~StereoSingleGpu();

    // Uploads both frames, runs the algorithm and downloads the result into `disparity`.
    void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);

private:
    int deviceId_;              // CUDA device index given to the constructor
    GpuMat d_leftFrame;         // device buffer the left frame is uploaded to
    GpuMat d_rightFrame;        // device buffer the right frame is uploaded to
    GpuMat d_disparity;         // device buffer the algorithm writes its output to
    Ptr<cuda::StereoBM> d_alg;  // algorithm object created with cuda::createStereoBM(256)
};
 | |
| 
 | |
| StereoSingleGpu::StereoSingleGpu(int deviceId) : deviceId_(deviceId)
 | |
| {
 | |
|     cuda::setDevice(deviceId_);
 | |
|     d_alg = cuda::createStereoBM(256);
 | |
| }
 | |
| 
 | |
| StereoSingleGpu::~StereoSingleGpu()
 | |
| {
 | |
|     cuda::setDevice(deviceId_);
 | |
|     d_leftFrame.release();
 | |
|     d_rightFrame.release();
 | |
|     d_disparity.release();
 | |
|     d_alg.release();
 | |
| }
 | |
| 
 | |
| void StereoSingleGpu::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
 | |
| {
 | |
|     cuda::setDevice(deviceId_);
 | |
|     d_leftFrame.upload(leftFrame);
 | |
|     d_rightFrame.upload(rightFrame);
 | |
|     d_alg->compute(d_leftFrame, d_rightFrame, d_disparity);
 | |
|     d_disparity.download(disparity);
 | |
| }
 | |
| 
 | |
| ///////////////////////////////////////////////////////////
 | |
| // StereoMultiGpuThread
 | |
| // Run Stereo algorithm on two GPUs using different host threads
 | |
| 
 | |
// Runs the cuda::StereoBM algorithm on GPUs 0 and 1, one host thread per GPU.
// All per-device members are arrays indexed by device id.
class StereoMultiGpuThread
{
public:
    // Creates one algorithm object per device (devices 0 and 1).
    StereoMultiGpuThread();
    // Releases the per-device buffers and algorithm objects.
    ~StereoMultiGpuThread();

    // Splits the frames into two overlapping row ranges, processes each range
    // on its own GPU in its own thread and waits for both threads.
    void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);

private:
    GpuMat d_leftFrames[2];         // device buffers for the left-frame parts
    GpuMat d_rightFrames[2];        // device buffers for the right-frame parts
    GpuMat d_disparities[2];        // device buffers for the per-part outputs
    Ptr<cuda::StereoBM> d_algs[2];  // one algorithm object per device

    // Everything one worker thread needs; passed to launchGpuStereoAlg as void*.
    struct StereoLaunchData
    {
        int deviceId;               // CUDA device the worker makes current
        Mat leftFrame;              // host row range of the left frame
        Mat rightFrame;             // host row range of the right frame
        Mat disparity;              // host row range of the output the worker downloads into
        GpuMat* d_leftFrame;        // points at d_leftFrames[deviceId]
        GpuMat* d_rightFrame;       // points at d_rightFrames[deviceId]
        GpuMat* d_disparity;        // points at d_disparities[deviceId]
        Ptr<cuda::StereoBM> d_alg;  // algorithm object for this device
    };

    // Thread entry point; `userData` is a StereoLaunchData*.
    static void launchGpuStereoAlg(void* userData);
};
 | |
| 
 | |
| StereoMultiGpuThread::StereoMultiGpuThread()
 | |
| {
 | |
|     cuda::setDevice(0);
 | |
|     d_algs[0] = cuda::createStereoBM(256);
 | |
| 
 | |
|     cuda::setDevice(1);
 | |
|     d_algs[1] = cuda::createStereoBM(256);
 | |
| }
 | |
| 
 | |
| StereoMultiGpuThread::~StereoMultiGpuThread()
 | |
| {
 | |
|     cuda::setDevice(0);
 | |
|     d_leftFrames[0].release();
 | |
|     d_rightFrames[0].release();
 | |
|     d_disparities[0].release();
 | |
|     d_algs[0].release();
 | |
| 
 | |
|     cuda::setDevice(1);
 | |
|     d_leftFrames[1].release();
 | |
|     d_rightFrames[1].release();
 | |
|     d_disparities[1].release();
 | |
|     d_algs[1].release();
 | |
| }
 | |
| 
 | |
| void StereoMultiGpuThread::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
 | |
| {
 | |
|     disparity.create(leftFrame.size(), CV_8UC1);
 | |
| 
 | |
|     // Split input data onto two parts for each GPUs.
 | |
|     // We add small border for each part,
 | |
|     // because original algorithm doesn't calculate disparity on image borders.
 | |
|     // With such padding we will get output in the middle of final result.
 | |
| 
 | |
|     StereoLaunchData launchDatas[2];
 | |
| 
 | |
|     launchDatas[0].deviceId = 0;
 | |
|     launchDatas[0].leftFrame = leftFrame.rowRange(0, leftFrame.rows / 2 + 32);
 | |
|     launchDatas[0].rightFrame = rightFrame.rowRange(0, rightFrame.rows / 2 + 32);
 | |
|     launchDatas[0].disparity = disparity.rowRange(0, leftFrame.rows / 2);
 | |
|     launchDatas[0].d_leftFrame = &d_leftFrames[0];
 | |
|     launchDatas[0].d_rightFrame = &d_rightFrames[0];
 | |
|     launchDatas[0].d_disparity = &d_disparities[0];
 | |
|     launchDatas[0].d_alg = d_algs[0];
 | |
| 
 | |
|     launchDatas[1].deviceId = 1;
 | |
|     launchDatas[1].leftFrame = leftFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
 | |
|     launchDatas[1].rightFrame = rightFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
 | |
|     launchDatas[1].disparity = disparity.rowRange(leftFrame.rows / 2, leftFrame.rows);
 | |
|     launchDatas[1].d_leftFrame = &d_leftFrames[1];
 | |
|     launchDatas[1].d_rightFrame = &d_rightFrames[1];
 | |
|     launchDatas[1].d_disparity = &d_disparities[1];
 | |
|     launchDatas[1].d_alg = d_algs[1];
 | |
| 
 | |
|     Thread thread0(launchGpuStereoAlg, &launchDatas[0]);
 | |
|     Thread thread1(launchGpuStereoAlg, &launchDatas[1]);
 | |
| 
 | |
|     thread0.wait();
 | |
|     thread1.wait();
 | |
| }
 | |
| 
 | |
| void StereoMultiGpuThread::launchGpuStereoAlg(void* userData)
 | |
| {
 | |
|     StereoLaunchData* data = static_cast<StereoLaunchData*>(userData);
 | |
| 
 | |
|     cuda::setDevice(data->deviceId);
 | |
|     data->d_leftFrame->upload(data->leftFrame);
 | |
|     data->d_rightFrame->upload(data->rightFrame);
 | |
|     data->d_alg->compute(*data->d_leftFrame, *data->d_rightFrame, *data->d_disparity);
 | |
| 
 | |
|     if (data->deviceId == 0)
 | |
|         data->d_disparity->rowRange(0, data->d_disparity->rows - 32).download(data->disparity);
 | |
|     else
 | |
|         data->d_disparity->rowRange(32, data->d_disparity->rows).download(data->disparity);
 | |
| }
 | |
| 
 | |
| ///////////////////////////////////////////////////////////
 | |
| // StereoMultiGpuStream
 | |
| // Run Stereo algorithm on two GPUs from single host thread using async API
 | |
| 
 | |
// Runs the cuda::StereoBM algorithm on GPUs 0 and 1 from a single host thread,
// issuing the work for each GPU on its own Stream.
// All per-device members are arrays indexed by device id.
class StereoMultiGpuStream
{
public:
    // Creates one algorithm object and one Stream per device (devices 0 and 1).
    StereoMultiGpuStream();
    // Releases the per-device buffers, algorithm objects and streams.
    ~StereoMultiGpuStream();

    // Enqueues upload, compute and download for both frame parts on the two
    // streams, then waits for both streams to complete.
    void compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity);

private:
    GpuMat d_leftFrames[2];         // device buffers for the left-frame parts
    GpuMat d_rightFrames[2];        // device buffers for the right-frame parts
    GpuMat d_disparities[2];        // device buffers for the per-part outputs
    Ptr<cuda::StereoBM> d_algs[2];  // one algorithm object per device
    Ptr<Stream> streams[2];         // one stream per device
};
 | |
| 
 | |
| StereoMultiGpuStream::StereoMultiGpuStream()
 | |
| {
 | |
|     cuda::setDevice(0);
 | |
|     d_algs[0] = cuda::createStereoBM(256);
 | |
|     streams[0] = makePtr<Stream>();
 | |
| 
 | |
|     cuda::setDevice(1);
 | |
|     d_algs[1] = cuda::createStereoBM(256);
 | |
|     streams[1] = makePtr<Stream>();
 | |
| }
 | |
| 
 | |
| StereoMultiGpuStream::~StereoMultiGpuStream()
 | |
| {
 | |
|     cuda::setDevice(0);
 | |
|     d_leftFrames[0].release();
 | |
|     d_rightFrames[0].release();
 | |
|     d_disparities[0].release();
 | |
|     d_algs[0].release();
 | |
|     streams[0].release();
 | |
| 
 | |
|     cuda::setDevice(1);
 | |
|     d_leftFrames[1].release();
 | |
|     d_rightFrames[1].release();
 | |
|     d_disparities[1].release();
 | |
|     d_algs[1].release();
 | |
|     streams[1].release();
 | |
| }
 | |
| 
 | |
| void StereoMultiGpuStream::compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity)
 | |
| {
 | |
|     disparity.create(leftFrame.size(), CV_8UC1);
 | |
| 
 | |
|     // Split input data onto two parts for each GPUs.
 | |
|     // We add small border for each part,
 | |
|     // because original algorithm doesn't calculate disparity on image borders.
 | |
|     // With such padding we will get output in the middle of final result.
 | |
| 
 | |
|     Mat leftFrameHdr = leftFrame.createMatHeader();
 | |
|     Mat rightFrameHdr = rightFrame.createMatHeader();
 | |
|     Mat disparityHdr = disparity.createMatHeader();
 | |
|     Mat disparityPart0 = disparityHdr.rowRange(0, leftFrame.rows / 2);
 | |
|     Mat disparityPart1 = disparityHdr.rowRange(leftFrame.rows / 2, leftFrame.rows);
 | |
| 
 | |
|     cuda::setDevice(0);
 | |
|     d_leftFrames[0].upload(leftFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
 | |
|     d_rightFrames[0].upload(rightFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
 | |
|     d_algs[0]->compute(d_leftFrames[0], d_rightFrames[0], d_disparities[0], *streams[0]);
 | |
|     d_disparities[0].rowRange(0, leftFrame.rows / 2).download(disparityPart0, *streams[0]);
 | |
| 
 | |
|     cuda::setDevice(1);
 | |
|     d_leftFrames[1].upload(leftFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
 | |
|     d_rightFrames[1].upload(rightFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
 | |
|     d_algs[1]->compute(d_leftFrames[1], d_rightFrames[1], d_disparities[1], *streams[1]);
 | |
|     d_disparities[1].rowRange(32, d_disparities[1].rows).download(disparityPart1, *streams[1]);
 | |
| 
 | |
|     cuda::setDevice(0);
 | |
|     streams[0]->waitForCompletion();
 | |
| 
 | |
|     cuda::setDevice(1);
 | |
|     streams[1]->waitForCompletion();
 | |
| }
 | |
| 
 | |
| ///////////////////////////////////////////////////////////
 | |
| // main
 | |
| 
 | |
| int main(int argc, char** argv)
 | |
| {
 | |
|     if (argc != 3)
 | |
|     {
 | |
|         cerr << "Usage: stereo_multi_gpu <left_video> <right_video>" << endl;
 | |
|         return -1;
 | |
|     }
 | |
| 
 | |
|     const int numDevices = getCudaEnabledDeviceCount();
 | |
|     if (numDevices != 2)
 | |
|     {
 | |
|         cerr << "Two GPUs are required" << endl;
 | |
|         return -1;
 | |
|     }
 | |
| 
 | |
|     for (int i = 0; i < numDevices; ++i)
 | |
|     {
 | |
|         DeviceInfo devInfo(i);
 | |
|         if (!devInfo.isCompatible())
 | |
|         {
 | |
|             cerr << "CUDA module was't built for GPU #" << i << " ("
 | |
|                  << devInfo.name() << ", CC " << devInfo.majorVersion()
 | |
|                  << devInfo.minorVersion() << endl;
 | |
|             return -1;
 | |
|         }
 | |
| 
 | |
|         printShortCudaDeviceInfo(i);
 | |
|     }
 | |
| 
 | |
|     VideoCapture leftVideo(argv[1]);
 | |
|     VideoCapture rightVideo(argv[2]);
 | |
| 
 | |
|     if (!leftVideo.isOpened())
 | |
|     {
 | |
|          cerr << "Can't open " << argv[1] << " video file" << endl;
 | |
|          return -1;
 | |
|     }
 | |
| 
 | |
|     if (!rightVideo.isOpened())
 | |
|     {
 | |
|          cerr << "Can't open " << argv[2] << " video file" << endl;
 | |
|          return -1;
 | |
|     }
 | |
| 
 | |
|     cout << endl;
 | |
|     cout << "This sample demonstrates working on one piece of data using two GPUs." << endl;
 | |
|     cout << "It splits input into two parts and processes them separately on different GPUs." << endl;
 | |
|     cout << endl;
 | |
| 
 | |
|     Mat leftFrame, rightFrame;
 | |
|     HostMem leftGrayFrame, rightGrayFrame;
 | |
| 
 | |
|     StereoSingleGpu gpu0Alg(0);
 | |
|     StereoSingleGpu gpu1Alg(1);
 | |
|     StereoMultiGpuThread multiThreadAlg;
 | |
|     StereoMultiGpuStream multiStreamAlg;
 | |
| 
 | |
|     Mat disparityGpu0;
 | |
|     Mat disparityGpu1;
 | |
|     Mat disparityMultiThread;
 | |
|     HostMem disparityMultiStream;
 | |
| 
 | |
|     Mat disparityGpu0Show;
 | |
|     Mat disparityGpu1Show;
 | |
|     Mat disparityMultiThreadShow;
 | |
|     Mat disparityMultiStreamShow;
 | |
| 
 | |
|     TickMeter tm;
 | |
| 
 | |
|     cout << "-------------------------------------------------------------------" << endl;
 | |
|     cout << "| Frame | GPU 0 ms | GPU 1 ms | Multi Thread ms | Multi Stream ms |" << endl;
 | |
|     cout << "-------------------------------------------------------------------" << endl;
 | |
| 
 | |
|     for (int i = 0;; ++i)
 | |
|     {
 | |
|         leftVideo >> leftFrame;
 | |
|         rightVideo >> rightFrame;
 | |
| 
 | |
|         if (leftFrame.empty() || rightFrame.empty())
 | |
|             break;
 | |
| 
 | |
|         if (leftFrame.size() != rightFrame.size())
 | |
|         {
 | |
|             cerr << "Frames have different sizes" << endl;
 | |
|             return -1;
 | |
|         }
 | |
| 
 | |
|         leftGrayFrame.create(leftFrame.size(), CV_8UC1);
 | |
|         rightGrayFrame.create(leftFrame.size(), CV_8UC1);
 | |
| 
 | |
|         cvtColor(leftFrame, leftGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
 | |
|         cvtColor(rightFrame, rightGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
 | |
| 
 | |
|         tm.reset(); tm.start();
 | |
|         gpu0Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
 | |
|                         disparityGpu0);
 | |
|         tm.stop();
 | |
| 
 | |
|         const double gpu0Time = tm.getTimeMilli();
 | |
| 
 | |
|         tm.reset(); tm.start();
 | |
|         gpu1Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
 | |
|                         disparityGpu1);
 | |
|         tm.stop();
 | |
| 
 | |
|         const double gpu1Time = tm.getTimeMilli();
 | |
| 
 | |
|         tm.reset(); tm.start();
 | |
|         multiThreadAlg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
 | |
|                                disparityMultiThread);
 | |
|         tm.stop();
 | |
| 
 | |
|         const double multiThreadTime = tm.getTimeMilli();
 | |
| 
 | |
|         tm.reset(); tm.start();
 | |
|         multiStreamAlg.compute(leftGrayFrame, rightGrayFrame, disparityMultiStream);
 | |
|         tm.stop();
 | |
| 
 | |
|         const double multiStreamTime = tm.getTimeMilli();
 | |
| 
 | |
|         cout << "| " << setw(5) << i << " | "
 | |
|              << setw(8) << setprecision(1) << fixed << gpu0Time << " | "
 | |
|              << setw(8) << setprecision(1) << fixed << gpu1Time << " | "
 | |
|              << setw(15) << setprecision(1) << fixed << multiThreadTime << " | "
 | |
|              << setw(15) << setprecision(1) << fixed << multiStreamTime << " |" << endl;
 | |
| 
 | |
|         resize(disparityGpu0, disparityGpu0Show, Size(1024, 768), 0, 0, INTER_AREA);
 | |
|         resize(disparityGpu1, disparityGpu1Show, Size(1024, 768), 0, 0, INTER_AREA);
 | |
|         resize(disparityMultiThread, disparityMultiThreadShow, Size(1024, 768), 0, 0, INTER_AREA);
 | |
|         resize(disparityMultiStream.createMatHeader(), disparityMultiStreamShow, Size(1024, 768), 0, 0, INTER_AREA);
 | |
| 
 | |
|         imshow("disparityGpu0", disparityGpu0Show);
 | |
|         imshow("disparityGpu1", disparityGpu1Show);
 | |
|         imshow("disparityMultiThread", disparityMultiThreadShow);
 | |
|         imshow("disparityMultiStream", disparityMultiStreamShow);
 | |
| 
 | |
|         const int key = waitKey(30) & 0xff;
 | |
|         if (key == 27)
 | |
|             break;
 | |
|     }
 | |
| 
 | |
|     cout << "-------------------------------------------------------------------" << endl;
 | |
| 
 | |
|     return 0;
 | |
| }
 | 
