diff --git a/modules/cudacodec/CMakeLists.txt b/modules/cudacodec/CMakeLists.txt index fd6ed2d1a..1dd4c92e5 100644 --- a/modules/cudacodec/CMakeLists.txt +++ b/modules/cudacodec/CMakeLists.txt @@ -4,9 +4,9 @@ endif() set(the_description "CUDA-accelerated Video Encoding/Decoding") -ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations) +ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef) -ocv_add_module(cudacodec opencv_highgui) +ocv_add_module(cudacodec opencv_highgui OPTIONAL opencv_cudev) ocv_module_include_directories() ocv_glob_module_sources() diff --git a/modules/cudacodec/src/cuda/nv12_to_rgb.cu b/modules/cudacodec/src/cuda/nv12_to_rgb.cu index a6a8c77ea..f45a314c6 100644 --- a/modules/cudacodec/src/cuda/nv12_to_rgb.cu +++ b/modules/cudacodec/src/cuda/nv12_to_rgb.cu @@ -47,13 +47,26 @@ * source and converts to output in ARGB format */ -#include "opencv2/core/cuda/common.hpp" +#include "opencv2/opencv_modules.hpp" -namespace cv { namespace cuda { namespace device +#ifndef HAVE_OPENCV_CUDEV + +#error "opencv_cudev is required" + +#else + +#include "opencv2/cudev/common.hpp" + +using namespace cv; +using namespace cv::cudev; + +void videoDecPostProcessFrame(const GpuMat& decodedFrame, OutputArray _outFrame, int width, int height); + +namespace { __constant__ float constHueColorSpaceMat[9] = {1.1644f, 0.0f, 1.596f, 1.1644f, -0.3918f, -0.813f, 1.1644f, 2.0172f, 0.0f}; - __device__ void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue) + __device__ static void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue) { float luma, chromaCb, chromaCr; @@ -76,7 +89,7 @@ namespace cv { namespace cuda { namespace device (chromaCr * constHueColorSpaceMat[8]); } - __device__ uint RGBA_pack_10bit(float red, float green, float blue, uint alpha) + __device__ static uint RGBA_pack_10bit(float red, float green, float blue, uint alpha) { uint ARGBpixel = 0; @@ -99,9 +112,9 @@ namespace cv { namespace cuda { namespace device #define COLOR_COMPONENT_BIT_SIZE 10 #define COLOR_COMPONENT_MASK 0x3FF - __global__ void NV12_to_RGB(uchar* srcImage, size_t nSourcePitch, - uint* dstImage, size_t nDestPitch, - uint width, uint height) + __global__ void NV12_to_RGB(const uchar* srcImage, size_t nSourcePitch, + uint* dstImage, size_t nDestPitch, + uint width, uint height) { // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread const int x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1); @@ -171,18 +184,24 @@ namespace cv { namespace cuda { namespace device dstImage[y * dstImagePitch + x ] = RGBA_pack_10bit(red[0], green[0], blue[0], ((uint)0xff << 24)); dstImage[y * dstImagePitch + x + 1 ] = RGBA_pack_10bit(red[1], green[1], blue[1], ((uint)0xff << 24)); } +} - void NV12_to_RGB(const PtrStepb decodedFrame, PtrStepSz<uint> interopFrame, cudaStream_t stream) - { - dim3 block(32, 8); - dim3 grid(divUp(interopFrame.cols, 2 * block.x), divUp(interopFrame.rows, block.y)); +void videoDecPostProcessFrame(const GpuMat& decodedFrame, OutputArray _outFrame, int width, int height) +{ + // Final Stage: NV12toARGB color space conversion - NV12_to_RGB<<<grid, block, 0, stream>>>(decodedFrame.data, decodedFrame.step, interopFrame.data, interopFrame.step, - interopFrame.cols, interopFrame.rows); + _outFrame.create(height, width, CV_8UC4); + GpuMat outFrame = _outFrame.getGpuMat(); - cudaSafeCall( cudaGetLastError() ); + dim3 block(32, 8); + dim3 grid(divUp(width, 2 * block.x), divUp(height, block.y)); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } -}}} + NV12_to_RGB<<<grid, block>>>(decodedFrame.ptr<uchar>(), decodedFrame.step, + outFrame.ptr<uint>(), outFrame.step, + width, height); + + CV_CUDEV_SAFE_CALL( cudaGetLastError() ); + CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); +} + +#endif diff --git a/modules/cudacodec/src/cuda/rgb_to_yv12.cu b/modules/cudacodec/src/cuda/rgb_to_yv12.cu index 18a85f562..ed0e0df9b 100644 --- a/modules/cudacodec/src/cuda/rgb_to_yv12.cu +++ b/modules/cudacodec/src/cuda/rgb_to_yv12.cu @@ -40,10 +40,21 @@ // //M*/ -#include "opencv2/core/cuda/common.hpp" -#include "opencv2/core/cuda/vec_traits.hpp" +#include "opencv2/opencv_modules.hpp" -namespace cv { namespace cuda { namespace device +#ifndef HAVE_OPENCV_CUDEV + +#error "opencv_cudev is required" + +#else + +#include "opencv2/cudev/ptr2d/glob.hpp" + +using namespace cv::cudev; + +void RGB_to_YV12(const GpuMat& src, GpuMat& dst); + +namespace { __device__ __forceinline__ void rgb_to_y(const uchar b, const uchar g, const uchar r, uchar& y) { @@ -57,7 +68,7 @@ namespace cv { namespace cuda { namespace device v = static_cast<uchar>(((int)(50 * r) - (int)(42 * g) - (int)(8 * b) + 12800) / 100); } - __global__ void Gray_to_YV12(const PtrStepSzb src, PtrStepb dst) + __global__ void Gray_to_YV12(const GlobPtrSz<uchar> src, GlobPtr<uchar> dst) { const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2; const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2; @@ -67,9 +78,9 @@ namespace cv { namespace cuda { namespace device // get pointers to the data const size_t planeSize = src.rows * dst.step; - PtrStepb y_plane(dst.data, dst.step); - PtrStepb u_plane(y_plane.data + planeSize, dst.step / 2); - PtrStepb v_plane(u_plane.data + (planeSize / 4), dst.step / 2); + GlobPtr<uchar> y_plane = globPtr(dst.data, dst.step); + GlobPtr<uchar> u_plane = globPtr(y_plane.data + planeSize, dst.step / 2); + GlobPtr<uchar> v_plane = globPtr(u_plane.data + (planeSize / 4), dst.step / 2); uchar pix; uchar y_val, u_val, v_val; @@ -94,7 +105,7 @@ namespace cv { namespace cuda { namespace device } template <typename T> - __global__ void RGB_to_YV12(const PtrStepSz<T> src, PtrStepb dst) + __global__ void RGB_to_YV12(const GlobPtrSz<T> src, GlobPtr<uchar> dst) { const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2; const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2; @@ -104,9 +115,9 @@ namespace cv { namespace cuda { namespace device // get pointers to the data const size_t planeSize = src.rows * dst.step; - PtrStepb y_plane(dst.data, dst.step); - PtrStepb u_plane(y_plane.data + planeSize, dst.step / 2); - PtrStepb v_plane(u_plane.data + (planeSize / 4), dst.step / 2); + GlobPtr<uchar> y_plane = globPtr(dst.data, dst.step); + GlobPtr<uchar> u_plane = globPtr(y_plane.data + planeSize, dst.step / 2); + GlobPtr<uchar> v_plane = globPtr(u_plane.data + (planeSize / 4), dst.step / 2); T pix; uchar y_val, u_val, v_val; @@ -129,42 +140,28 @@ namespace cv { namespace cuda { namespace device u_plane(y / 2, x / 2) = u_val; v_plane(y / 2, x / 2) = v_val; } +} - void Gray_to_YV12_caller(const PtrStepSzb src, PtrStepb dst, cudaStream_t stream) +void RGB_to_YV12(const GpuMat& src, GpuMat& dst) +{ + const dim3 block(32, 8); + const dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2)); + + switch (src.channels()) { - dim3 block(32, 8); - dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2)); - - Gray_to_YV12<<<grid, block, 0, stream>>>(src, dst); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - template <int cn> - void RGB_to_YV12_caller(const PtrStepSzb src, PtrStepb dst, cudaStream_t stream) - { - typedef typename TypeVec<uchar, cn>::vec_type src_t; - - dim3 block(32, 8); - dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2)); - - RGB_to_YV12<<<grid, block, 0, stream>>>(static_cast< PtrStepSz<src_t> >(src), dst); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + case 1: + Gray_to_YV12<<<grid, block>>>(globPtr<uchar>(src), globPtr<uchar>(dst)); + break; + case 3: + RGB_to_YV12<<<grid, block>>>(globPtr<uchar3>(src), globPtr<uchar>(dst)); + break; + case 4: + RGB_to_YV12<<<grid, block>>>(globPtr<uchar4>(src), globPtr<uchar>(dst)); + break; } - void RGB_to_YV12(const PtrStepSzb src, int cn, PtrStepSzb dst, cudaStream_t stream) - { - typedef void (*func_t)(const PtrStepSzb src, PtrStepb dst, cudaStream_t stream); + CV_CUDEV_SAFE_CALL( cudaGetLastError() ); + CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); +} - static const func_t funcs[] = - { - 0, Gray_to_YV12_caller, 0, RGB_to_YV12_caller<3>, RGB_to_YV12_caller<4> - }; - - funcs[cn](src, dst, stream); - } -}}} +#endif diff --git a/modules/cudacodec/src/video_reader.cpp b/modules/cudacodec/src/video_reader.cpp index ede7ef43e..98f0ba1d9 100644 --- a/modules/cudacodec/src/video_reader.cpp +++ b/modules/cudacodec/src/video_reader.cpp @@ -53,10 +53,7 @@ Ptr<VideoReader> cv::cudacodec::createVideoReader(const Ptr<RawVideoSource>&) { #else // HAVE_NVCUVID -namespace cv { namespace cuda { namespace device -{ - void NV12_to_RGB(const PtrStepb decodedFrame, PtrStepSz<uint> interopFrame, cudaStream_t stream = 0); -}}} +void videoDecPostProcessFrame(const GpuMat& decodedFrame, OutputArray _outFrame, int width, int height); using namespace cv::cudacodec::detail; @@ -125,18 +122,6 @@ namespace CUvideoctxlock m_lock; }; - void cudaPostProcessFrame(const GpuMat& decodedFrame, OutputArray _outFrame, int width, int height) - { - using namespace cv::cuda::device; - - // Final Stage: NV12toARGB color space conversion - - _outFrame.create(height, width, CV_8UC4); - GpuMat outFrame = _outFrame.getGpuMat(); - - NV12_to_RGB(decodedFrame, outFrame); - } - bool VideoReaderImpl::nextFrame(OutputArray frame) { if (videoSource_->hasError() || videoParser_->hasError()) @@ -195,7 +180,7 @@ namespace // perform post processing on the CUDA surface (performs colors space conversion and post processing) // comment this out if we inclue the line of code seen above - cudaPostProcessFrame(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight()); + videoDecPostProcessFrame(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight()); // unmap video frame // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding) diff --git a/modules/cudacodec/src/video_writer.cpp b/modules/cudacodec/src/video_writer.cpp index 78183fb9d..15f6732e3 100644 --- a/modules/cudacodec/src/video_writer.cpp +++ b/modules/cudacodec/src/video_writer.cpp @@ -62,10 +62,7 @@ Ptr<VideoWriter> cv::cudacodec::createVideoWriter(const Ptr<EncoderCallBack>&, S #else // !defined HAVE_CUDA || !defined WIN32 -namespace cv { namespace cuda { namespace device -{ - void RGB_to_YV12(const PtrStepSzb src, int cn, PtrStepSzb dst, cudaStream_t stream = 0); -}}} +void RGB_to_YV12(const GpuMat& src, GpuMat& dst); /////////////////////////////////////////////////////////////////////////// // VideoWriterImpl @@ -642,7 +639,7 @@ namespace if (inputFormat_ == SF_BGR) { - device::RGB_to_YV12(frame, frame.channels(), videoFrame_); + RGB_to_YV12(frame, videoFrame_); } else { diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp index 7304a8c7f..738592663 100644 --- a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp +++ b/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp @@ -72,7 +72,7 @@ template <typename T> struct GlobPtrSz : GlobPtr<T> }; template <typename T> -__host__ GlobPtr<T> globPtr(T* data, size_t step) +__host__ __device__ GlobPtr<T> globPtr(T* data, size_t step) { GlobPtr<T> p; p.data = data; @@ -81,7 +81,7 @@ __host__ GlobPtr<T> globPtr(T* data, size_t step) } template <typename T> -__host__ GlobPtrSz<T> globPtr(T* data, size_t step, int rows, int cols) +__host__ __device__ GlobPtrSz<T> globPtr(T* data, size_t step, int rows, int cols) { GlobPtrSz<T> p; p.data = data;