Merge pull request #1492 from jet47:gpucodec-cudev
This commit is contained in:
commit
e290436a4c
@ -4,9 +4,9 @@ endif()
|
|||||||
|
|
||||||
set(the_description "CUDA-accelerated Video Encoding/Decoding")
|
set(the_description "CUDA-accelerated Video Encoding/Decoding")
|
||||||
|
|
||||||
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
|
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef)
|
||||||
|
|
||||||
ocv_add_module(cudacodec opencv_highgui)
|
ocv_add_module(cudacodec opencv_highgui OPTIONAL opencv_cudev)
|
||||||
|
|
||||||
ocv_module_include_directories()
|
ocv_module_include_directories()
|
||||||
ocv_glob_module_sources()
|
ocv_glob_module_sources()
|
||||||
|
@ -47,13 +47,26 @@
|
|||||||
* source and converts to output in ARGB format
|
* source and converts to output in ARGB format
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "opencv2/core/cuda/common.hpp"
|
#include "opencv2/opencv_modules.hpp"
|
||||||
|
|
||||||
namespace cv { namespace cuda { namespace device
|
#ifndef HAVE_OPENCV_CUDEV
|
||||||
|
|
||||||
|
#error "opencv_cudev is required"
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#include "opencv2/cudev/common.hpp"
|
||||||
|
|
||||||
|
using namespace cv;
|
||||||
|
using namespace cv::cudev;
|
||||||
|
|
||||||
|
void videoDecPostProcessFrame(const GpuMat& decodedFrame, OutputArray _outFrame, int width, int height);
|
||||||
|
|
||||||
|
namespace
|
||||||
{
|
{
|
||||||
__constant__ float constHueColorSpaceMat[9] = {1.1644f, 0.0f, 1.596f, 1.1644f, -0.3918f, -0.813f, 1.1644f, 2.0172f, 0.0f};
|
__constant__ float constHueColorSpaceMat[9] = {1.1644f, 0.0f, 1.596f, 1.1644f, -0.3918f, -0.813f, 1.1644f, 2.0172f, 0.0f};
|
||||||
|
|
||||||
__device__ void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue)
|
__device__ static void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue)
|
||||||
{
|
{
|
||||||
float luma, chromaCb, chromaCr;
|
float luma, chromaCb, chromaCr;
|
||||||
|
|
||||||
@ -76,7 +89,7 @@ namespace cv { namespace cuda { namespace device
|
|||||||
(chromaCr * constHueColorSpaceMat[8]);
|
(chromaCr * constHueColorSpaceMat[8]);
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ uint RGBA_pack_10bit(float red, float green, float blue, uint alpha)
|
__device__ static uint RGBA_pack_10bit(float red, float green, float blue, uint alpha)
|
||||||
{
|
{
|
||||||
uint ARGBpixel = 0;
|
uint ARGBpixel = 0;
|
||||||
|
|
||||||
@ -99,7 +112,7 @@ namespace cv { namespace cuda { namespace device
|
|||||||
#define COLOR_COMPONENT_BIT_SIZE 10
|
#define COLOR_COMPONENT_BIT_SIZE 10
|
||||||
#define COLOR_COMPONENT_MASK 0x3FF
|
#define COLOR_COMPONENT_MASK 0x3FF
|
||||||
|
|
||||||
__global__ void NV12_to_RGB(uchar* srcImage, size_t nSourcePitch,
|
__global__ void NV12_to_RGB(const uchar* srcImage, size_t nSourcePitch,
|
||||||
uint* dstImage, size_t nDestPitch,
|
uint* dstImage, size_t nDestPitch,
|
||||||
uint width, uint height)
|
uint width, uint height)
|
||||||
{
|
{
|
||||||
@ -171,18 +184,24 @@ namespace cv { namespace cuda { namespace device
|
|||||||
dstImage[y * dstImagePitch + x ] = RGBA_pack_10bit(red[0], green[0], blue[0], ((uint)0xff << 24));
|
dstImage[y * dstImagePitch + x ] = RGBA_pack_10bit(red[0], green[0], blue[0], ((uint)0xff << 24));
|
||||||
dstImage[y * dstImagePitch + x + 1 ] = RGBA_pack_10bit(red[1], green[1], blue[1], ((uint)0xff << 24));
|
dstImage[y * dstImagePitch + x + 1 ] = RGBA_pack_10bit(red[1], green[1], blue[1], ((uint)0xff << 24));
|
||||||
}
|
}
|
||||||
|
|
||||||
void NV12_to_RGB(const PtrStepb decodedFrame, PtrStepSz<uint> interopFrame, cudaStream_t stream)
|
|
||||||
{
|
|
||||||
dim3 block(32, 8);
|
|
||||||
dim3 grid(divUp(interopFrame.cols, 2 * block.x), divUp(interopFrame.rows, block.y));
|
|
||||||
|
|
||||||
NV12_to_RGB<<<grid, block, 0, stream>>>(decodedFrame.data, decodedFrame.step, interopFrame.data, interopFrame.step,
|
|
||||||
interopFrame.cols, interopFrame.rows);
|
|
||||||
|
|
||||||
cudaSafeCall( cudaGetLastError() );
|
|
||||||
|
|
||||||
if (stream == 0)
|
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
|
||||||
}
|
}
|
||||||
}}}
|
|
||||||
|
void videoDecPostProcessFrame(const GpuMat& decodedFrame, OutputArray _outFrame, int width, int height)
|
||||||
|
{
|
||||||
|
// Final Stage: NV12toARGB color space conversion
|
||||||
|
|
||||||
|
_outFrame.create(height, width, CV_8UC4);
|
||||||
|
GpuMat outFrame = _outFrame.getGpuMat();
|
||||||
|
|
||||||
|
dim3 block(32, 8);
|
||||||
|
dim3 grid(divUp(width, 2 * block.x), divUp(height, block.y));
|
||||||
|
|
||||||
|
NV12_to_RGB<<<grid, block>>>(decodedFrame.ptr<uchar>(), decodedFrame.step,
|
||||||
|
outFrame.ptr<uint>(), outFrame.step,
|
||||||
|
width, height);
|
||||||
|
|
||||||
|
CV_CUDEV_SAFE_CALL( cudaGetLastError() );
|
||||||
|
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
@ -40,10 +40,21 @@
|
|||||||
//
|
//
|
||||||
//M*/
|
//M*/
|
||||||
|
|
||||||
#include "opencv2/core/cuda/common.hpp"
|
#include "opencv2/opencv_modules.hpp"
|
||||||
#include "opencv2/core/cuda/vec_traits.hpp"
|
|
||||||
|
|
||||||
namespace cv { namespace cuda { namespace device
|
#ifndef HAVE_OPENCV_CUDEV
|
||||||
|
|
||||||
|
#error "opencv_cudev is required"
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#include "opencv2/cudev/ptr2d/glob.hpp"
|
||||||
|
|
||||||
|
using namespace cv::cudev;
|
||||||
|
|
||||||
|
void RGB_to_YV12(const GpuMat& src, GpuMat& dst);
|
||||||
|
|
||||||
|
namespace
|
||||||
{
|
{
|
||||||
__device__ __forceinline__ void rgb_to_y(const uchar b, const uchar g, const uchar r, uchar& y)
|
__device__ __forceinline__ void rgb_to_y(const uchar b, const uchar g, const uchar r, uchar& y)
|
||||||
{
|
{
|
||||||
@ -57,7 +68,7 @@ namespace cv { namespace cuda { namespace device
|
|||||||
v = static_cast<uchar>(((int)(50 * r) - (int)(42 * g) - (int)(8 * b) + 12800) / 100);
|
v = static_cast<uchar>(((int)(50 * r) - (int)(42 * g) - (int)(8 * b) + 12800) / 100);
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void Gray_to_YV12(const PtrStepSzb src, PtrStepb dst)
|
__global__ void Gray_to_YV12(const GlobPtrSz<uchar> src, GlobPtr<uchar> dst)
|
||||||
{
|
{
|
||||||
const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
|
const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
|
||||||
const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
|
const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
|
||||||
@ -67,9 +78,9 @@ namespace cv { namespace cuda { namespace device
|
|||||||
|
|
||||||
// get pointers to the data
|
// get pointers to the data
|
||||||
const size_t planeSize = src.rows * dst.step;
|
const size_t planeSize = src.rows * dst.step;
|
||||||
PtrStepb y_plane(dst.data, dst.step);
|
GlobPtr<uchar> y_plane = globPtr(dst.data, dst.step);
|
||||||
PtrStepb u_plane(y_plane.data + planeSize, dst.step / 2);
|
GlobPtr<uchar> u_plane = globPtr(y_plane.data + planeSize, dst.step / 2);
|
||||||
PtrStepb v_plane(u_plane.data + (planeSize / 4), dst.step / 2);
|
GlobPtr<uchar> v_plane = globPtr(u_plane.data + (planeSize / 4), dst.step / 2);
|
||||||
|
|
||||||
uchar pix;
|
uchar pix;
|
||||||
uchar y_val, u_val, v_val;
|
uchar y_val, u_val, v_val;
|
||||||
@ -94,7 +105,7 @@ namespace cv { namespace cuda { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__global__ void RGB_to_YV12(const PtrStepSz<T> src, PtrStepb dst)
|
__global__ void RGB_to_YV12(const GlobPtrSz<T> src, GlobPtr<uchar> dst)
|
||||||
{
|
{
|
||||||
const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
|
const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
|
||||||
const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
|
const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
|
||||||
@ -104,9 +115,9 @@ namespace cv { namespace cuda { namespace device
|
|||||||
|
|
||||||
// get pointers to the data
|
// get pointers to the data
|
||||||
const size_t planeSize = src.rows * dst.step;
|
const size_t planeSize = src.rows * dst.step;
|
||||||
PtrStepb y_plane(dst.data, dst.step);
|
GlobPtr<uchar> y_plane = globPtr(dst.data, dst.step);
|
||||||
PtrStepb u_plane(y_plane.data + planeSize, dst.step / 2);
|
GlobPtr<uchar> u_plane = globPtr(y_plane.data + planeSize, dst.step / 2);
|
||||||
PtrStepb v_plane(u_plane.data + (planeSize / 4), dst.step / 2);
|
GlobPtr<uchar> v_plane = globPtr(u_plane.data + (planeSize / 4), dst.step / 2);
|
||||||
|
|
||||||
T pix;
|
T pix;
|
||||||
uchar y_val, u_val, v_val;
|
uchar y_val, u_val, v_val;
|
||||||
@ -129,42 +140,28 @@ namespace cv { namespace cuda { namespace device
|
|||||||
u_plane(y / 2, x / 2) = u_val;
|
u_plane(y / 2, x / 2) = u_val;
|
||||||
v_plane(y / 2, x / 2) = v_val;
|
v_plane(y / 2, x / 2) = v_val;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Gray_to_YV12_caller(const PtrStepSzb src, PtrStepb dst, cudaStream_t stream)
|
|
||||||
{
|
|
||||||
dim3 block(32, 8);
|
|
||||||
dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));
|
|
||||||
|
|
||||||
Gray_to_YV12<<<grid, block, 0, stream>>>(src, dst);
|
|
||||||
cudaSafeCall( cudaGetLastError() );
|
|
||||||
|
|
||||||
if (stream == 0)
|
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
|
||||||
}
|
|
||||||
template <int cn>
|
|
||||||
void RGB_to_YV12_caller(const PtrStepSzb src, PtrStepb dst, cudaStream_t stream)
|
|
||||||
{
|
|
||||||
typedef typename TypeVec<uchar, cn>::vec_type src_t;
|
|
||||||
|
|
||||||
dim3 block(32, 8);
|
|
||||||
dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));
|
|
||||||
|
|
||||||
RGB_to_YV12<<<grid, block, 0, stream>>>(static_cast< PtrStepSz<src_t> >(src), dst);
|
|
||||||
cudaSafeCall( cudaGetLastError() );
|
|
||||||
|
|
||||||
if (stream == 0)
|
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void RGB_to_YV12(const PtrStepSzb src, int cn, PtrStepSzb dst, cudaStream_t stream)
|
void RGB_to_YV12(const GpuMat& src, GpuMat& dst)
|
||||||
{
|
{
|
||||||
typedef void (*func_t)(const PtrStepSzb src, PtrStepb dst, cudaStream_t stream);
|
const dim3 block(32, 8);
|
||||||
|
const dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));
|
||||||
|
|
||||||
static const func_t funcs[] =
|
switch (src.channels())
|
||||||
{
|
{
|
||||||
0, Gray_to_YV12_caller, 0, RGB_to_YV12_caller<3>, RGB_to_YV12_caller<4>
|
case 1:
|
||||||
};
|
Gray_to_YV12<<<grid, block>>>(globPtr<uchar>(src), globPtr<uchar>(dst));
|
||||||
|
break;
|
||||||
funcs[cn](src, dst, stream);
|
case 3:
|
||||||
|
RGB_to_YV12<<<grid, block>>>(globPtr<uchar3>(src), globPtr<uchar>(dst));
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
RGB_to_YV12<<<grid, block>>>(globPtr<uchar4>(src), globPtr<uchar>(dst));
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}}}
|
|
||||||
|
CV_CUDEV_SAFE_CALL( cudaGetLastError() );
|
||||||
|
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
@ -53,10 +53,7 @@ Ptr<VideoReader> cv::cudacodec::createVideoReader(const Ptr<RawVideoSource>&) {
|
|||||||
|
|
||||||
#else // HAVE_NVCUVID
|
#else // HAVE_NVCUVID
|
||||||
|
|
||||||
namespace cv { namespace cuda { namespace device
|
void videoDecPostProcessFrame(const GpuMat& decodedFrame, OutputArray _outFrame, int width, int height);
|
||||||
{
|
|
||||||
void NV12_to_RGB(const PtrStepb decodedFrame, PtrStepSz<uint> interopFrame, cudaStream_t stream = 0);
|
|
||||||
}}}
|
|
||||||
|
|
||||||
using namespace cv::cudacodec::detail;
|
using namespace cv::cudacodec::detail;
|
||||||
|
|
||||||
@ -125,18 +122,6 @@ namespace
|
|||||||
CUvideoctxlock m_lock;
|
CUvideoctxlock m_lock;
|
||||||
};
|
};
|
||||||
|
|
||||||
void cudaPostProcessFrame(const GpuMat& decodedFrame, OutputArray _outFrame, int width, int height)
|
|
||||||
{
|
|
||||||
using namespace cv::cuda::device;
|
|
||||||
|
|
||||||
// Final Stage: NV12toARGB color space conversion
|
|
||||||
|
|
||||||
_outFrame.create(height, width, CV_8UC4);
|
|
||||||
GpuMat outFrame = _outFrame.getGpuMat();
|
|
||||||
|
|
||||||
NV12_to_RGB(decodedFrame, outFrame);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool VideoReaderImpl::nextFrame(OutputArray frame)
|
bool VideoReaderImpl::nextFrame(OutputArray frame)
|
||||||
{
|
{
|
||||||
if (videoSource_->hasError() || videoParser_->hasError())
|
if (videoSource_->hasError() || videoParser_->hasError())
|
||||||
@ -195,7 +180,7 @@ namespace
|
|||||||
|
|
||||||
// perform post processing on the CUDA surface (performs colors space conversion and post processing)
|
// perform post processing on the CUDA surface (performs colors space conversion and post processing)
|
||||||
// comment this out if we inclue the line of code seen above
|
// comment this out if we inclue the line of code seen above
|
||||||
cudaPostProcessFrame(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight());
|
videoDecPostProcessFrame(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight());
|
||||||
|
|
||||||
// unmap video frame
|
// unmap video frame
|
||||||
// unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
|
// unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
|
||||||
|
@ -62,10 +62,7 @@ Ptr<VideoWriter> cv::cudacodec::createVideoWriter(const Ptr<EncoderCallBack>&, S
|
|||||||
|
|
||||||
#else // !defined HAVE_CUDA || !defined WIN32
|
#else // !defined HAVE_CUDA || !defined WIN32
|
||||||
|
|
||||||
namespace cv { namespace cuda { namespace device
|
void RGB_to_YV12(const GpuMat& src, GpuMat& dst);
|
||||||
{
|
|
||||||
void RGB_to_YV12(const PtrStepSzb src, int cn, PtrStepSzb dst, cudaStream_t stream = 0);
|
|
||||||
}}}
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// VideoWriterImpl
|
// VideoWriterImpl
|
||||||
@ -642,7 +639,7 @@ namespace
|
|||||||
|
|
||||||
if (inputFormat_ == SF_BGR)
|
if (inputFormat_ == SF_BGR)
|
||||||
{
|
{
|
||||||
device::RGB_to_YV12(frame, frame.channels(), videoFrame_);
|
RGB_to_YV12(frame, videoFrame_);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -72,7 +72,7 @@ template <typename T> struct GlobPtrSz : GlobPtr<T>
|
|||||||
};
|
};
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__host__ GlobPtr<T> globPtr(T* data, size_t step)
|
__host__ __device__ GlobPtr<T> globPtr(T* data, size_t step)
|
||||||
{
|
{
|
||||||
GlobPtr<T> p;
|
GlobPtr<T> p;
|
||||||
p.data = data;
|
p.data = data;
|
||||||
@ -81,7 +81,7 @@ __host__ GlobPtr<T> globPtr(T* data, size_t step)
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
__host__ GlobPtrSz<T> globPtr(T* data, size_t step, int rows, int cols)
|
__host__ __device__ GlobPtrSz<T> globPtr(T* data, size_t step, int rows, int cols)
|
||||||
{
|
{
|
||||||
GlobPtrSz<T> p;
|
GlobPtrSz<T> p;
|
||||||
p.data = data;
|
p.data = data;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user