diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index e2bb5ded3..7d0074d5f 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -411,7 +411,7 @@ namespace cv CV_EXPORTS Scalar sum(const GpuMat& m); //! finds global minimum and maximum array elements and returns their values - //! supports only CV_8UC1 type + //! supports CV_8UC1 and CV_8UC4 type //! disabled until fix npp bug CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal = 0); @@ -649,6 +649,12 @@ namespace cv //! returns the Gaussian filter engine CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0); + //! returns maximum filter + CV_EXPORTS Ptr getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1)); + + //! returns minimum filter + CV_EXPORTS Ptr getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1)); + //! smooths the image using the normalized box filter //! supports CV_8UC1, CV_8UC4 types CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1)); diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 4104cb0ab..e5eef84fb 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -419,24 +419,55 @@ Scalar cv::gpu::sum(const GpuMat& src) //////////////////////////////////////////////////////////////////////// // minMax +namespace +{ + void minMax_c1(const GpuMat& src, double* minVal, double* maxVal) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Npp8u min_res, max_res; + + nppSafeCall( nppiMinMax_8u_C1R(src.ptr(), src.step, sz, &min_res, &max_res) ); + + if (minVal) + *minVal = min_res; + + if (maxVal) + *maxVal = max_res; + } + + void minMax_c4(const GpuMat& src, double* minVal, double* maxVal) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Npp8u* cuMin = nppsMalloc_8u(4); + Npp8u* cuMax = nppsMalloc_8u(4); + + nppSafeCall( nppiMinMax_8u_C4R(src.ptr(), src.step, sz, cuMin, cuMax) ); + + if (minVal) + cudaMemcpy(minVal, cuMin, 4 * sizeof(Npp8u), cudaMemcpyDeviceToHost); + if (maxVal) + cudaMemcpy(maxVal, cuMax, 4 * sizeof(Npp8u), cudaMemcpyDeviceToHost); + + nppsFree(cuMin); + nppsFree(cuMax); + } +} + void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal) { + typedef void (*minMax_t)(const GpuMat& src, double* minVal, double* maxVal); + static const minMax_t minMax_callers[] = {0, minMax_c1, 0, 0, minMax_c4}; + CV_Assert(!"disabled until fix npp bug"); - CV_Assert(src.type() == CV_8UC1); + CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4); - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Npp8u min_res, max_res; - - nppSafeCall( nppiMinMax_8u_C1R(src.ptr(), src.step, sz, &min_res, &max_res) ); - - if (minVal) - *minVal = min_res; - - if (maxVal) - *maxVal = max_res; + minMax_callers[src.channels()](src, minVal, maxVal); } //////////////////////////////////////////////////////////////////////// diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu index 3b81eb6a9..0384b1dbf 100644 --- a/modules/gpu/src/cuda/color.cu +++ b/modules/gpu/src/cuda/color.cu @@ -625,9 +625,8 @@ namespace imgproc template struct RGB2YCrCbConverter { - typedef typename TypeVec::vec_t dst_t; - - static __device__ void cvt(const T* src, dst_t& dst, int bidx) + template + static __device__ void cvt(const T* src, D& dst, int bidx) { const int delta = ColorChannel::half() * (1 << yuv_shift); @@ -642,9 +641,8 @@ namespace imgproc }; template<> struct RGB2YCrCbConverter { - typedef typename TypeVec::vec_t dst_t; - - static __device__ void cvt(const float* src, dst_t& dst, int bidx) + template + static __device__ void cvt(const float* src, D& dst, int bidx) { dst.x = src[0] * cYCrCbCoeffs_f[0] + src[1] * cYCrCbCoeffs_f[1] + src[2] * cYCrCbCoeffs_f[2]; dst.y = (src[bidx^2] - dst.x) * cYCrCbCoeffs_f[3] + ColorChannel::half(); @@ -652,11 +650,11 @@ namespace imgproc } }; - template + template __global__ void RGB2YCrCb(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) { typedef typename TypeVec::vec_t src_t; - typedef typename TypeVec::vec_t dst_t; + typedef typename TypeVec::vec_t dst_t; const int x = blockDim.x * blockIdx.x + threadIdx.x; const int y = blockDim.y * blockIdx.y + threadIdx.y; @@ -668,30 +666,28 @@ namespace imgproc RGB2YCrCbConverter::cvt(((const T*)(&src)), dst, bidx); - *(dst_t*)(dst_ + y * dst_step + x * 3 * sizeof(T)) = dst; + *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; } } - template struct YCrCb2RGBConvertor + template struct YCrCb2RGBConvertor { - typedef typename TypeVec::vec_t src_t; - - static __device__ void cvt(const src_t& src, T* dst, int bidx) + template + static __device__ void cvt(const T& src, D* dst, int bidx) { - const int b = src.x + CV_DESCALE((src.z - ColorChannel::half()) * cYCrCbCoeffs_i[3], yuv_shift); - const int g = src.x + CV_DESCALE((src.z - ColorChannel::half()) * cYCrCbCoeffs_i[2] + (src.y - ColorChannel::half()) * cYCrCbCoeffs_i[1], yuv_shift); - const int r = src.x + CV_DESCALE((src.y - ColorChannel::half()) * cYCrCbCoeffs_i[0], yuv_shift); + const int b = src.x + CV_DESCALE((src.z - ColorChannel::half()) * cYCrCbCoeffs_i[3], yuv_shift); + const int g = src.x + CV_DESCALE((src.z - ColorChannel::half()) * cYCrCbCoeffs_i[2] + (src.y - ColorChannel::half()) * cYCrCbCoeffs_i[1], yuv_shift); + const int r = src.x + CV_DESCALE((src.y - ColorChannel::half()) * cYCrCbCoeffs_i[0], yuv_shift); - dst[bidx] = saturate_cast(b); - dst[1] = saturate_cast(g); - dst[bidx^2] = saturate_cast(r); + dst[bidx] = saturate_cast(b); + dst[1] = saturate_cast(g); + dst[bidx^2] = saturate_cast(r); } }; template <> struct YCrCb2RGBConvertor { - typedef typename TypeVec::vec_t src_t; - - static __device__ void cvt(const src_t& src, float* dst, int bidx) + template + static __device__ void cvt(const T& src, float* dst, int bidx) { dst[bidx] = src.x + (src.z - ColorChannel::half()) * cYCrCbCoeffs_f[3]; dst[1] = src.x + (src.z - ColorChannel::half()) * cYCrCbCoeffs_f[2] + (src.y - ColorChannel::half()) * cYCrCbCoeffs_f[1]; @@ -699,10 +695,10 @@ namespace imgproc } }; - template + template __global__ void YCrCb2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) { - typedef typename TypeVec::vec_t src_t; + typedef typename TypeVec::vec_t src_t; typedef typename TypeVec::vec_t dst_t; const int x = blockDim.x * blockIdx.x + threadIdx.x; @@ -710,7 +706,7 @@ namespace imgproc if (y < rows && x < cols) { - src_t src = *(const src_t*)(src_ + y * src_step + x * 3 * sizeof(T)); + src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T)); dst_t dst; YCrCb2RGBConvertor::cvt(src, ((T*)(&dst)), bidx); @@ -723,7 +719,7 @@ namespace imgproc namespace cv { namespace gpu { namespace improc { - template + template void RGB2YCrCb_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) { dim3 threads(32, 8, 1); @@ -732,53 +728,56 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB2YCrCb<<>>(src.ptr, src.step, + imgproc::RGB2YCrCb<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); } - void RGB2YCrCb_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, const int* coeffs, cudaStream_t stream) + void RGB2YCrCb_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream) { typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream); - static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2] = + static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] = { - RGB2YCrCb_caller, RGB2YCrCb_caller + {RGB2YCrCb_caller, RGB2YCrCb_caller}, + {RGB2YCrCb_caller, RGB2YCrCb_caller} }; cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); - RGB2YCrCb_callers[srccn-3](src, dst, bidx, stream); + RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } - void RGB2YCrCb_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, const int* coeffs, cudaStream_t stream) + void RGB2YCrCb_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream) { typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream); - static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2] = + static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] = { - RGB2YCrCb_caller, RGB2YCrCb_caller + {RGB2YCrCb_caller, RGB2YCrCb_caller}, + {RGB2YCrCb_caller, RGB2YCrCb_caller} }; cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); - RGB2YCrCb_callers[srccn-3](src, dst, bidx, stream); + RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } - void RGB2YCrCb_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, const float* coeffs, cudaStream_t stream) + void RGB2YCrCb_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const float* coeffs, cudaStream_t stream) { typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream); - static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2] = + static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] = { - RGB2YCrCb_caller, RGB2YCrCb_caller + {RGB2YCrCb_caller, RGB2YCrCb_caller}, + {RGB2YCrCb_caller, RGB2YCrCb_caller} }; cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) ); - RGB2YCrCb_callers[srccn-3](src, dst, bidx, stream); + RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } - template + template void YCrCb2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) { dim3 threads(32, 8, 1); @@ -787,50 +786,53 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::YCrCb2RGB<<>>(src.ptr, src.step, + imgproc::YCrCb2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols, bidx); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); } - void YCrCb2RGB_gpu_8u(const DevMem2D& src, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream) + void YCrCb2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream) { typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream); - static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2] = + static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] = { - YCrCb2RGB_caller, YCrCb2RGB_caller + {YCrCb2RGB_caller, YCrCb2RGB_caller}, + {YCrCb2RGB_caller, YCrCb2RGB_caller} }; cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); - YCrCb2RGB_callers[dstcn-3](src, dst, bidx, stream); + YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } - void YCrCb2RGB_gpu_16u(const DevMem2D& src, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream) + void YCrCb2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream) { typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream); - static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2] = + static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] = { - YCrCb2RGB_caller, YCrCb2RGB_caller + {YCrCb2RGB_caller, YCrCb2RGB_caller}, + {YCrCb2RGB_caller, YCrCb2RGB_caller} }; cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); - YCrCb2RGB_callers[dstcn-3](src, dst, bidx, stream); + YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } - void YCrCb2RGB_gpu_32f(const DevMem2D& src, const DevMem2D& dst, int dstcn, int bidx, const float* coeffs, cudaStream_t stream) + void YCrCb2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const float* coeffs, cudaStream_t stream) { typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream); - static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2] = + static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] = { - YCrCb2RGB_caller, YCrCb2RGB_caller + {YCrCb2RGB_caller, YCrCb2RGB_caller}, + {YCrCb2RGB_caller, YCrCb2RGB_caller} }; cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) ); - YCrCb2RGB_callers[dstcn-3](src, dst, bidx, stream); + YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); } }}} @@ -843,75 +845,29 @@ namespace imgproc template struct RGB2XYZConvertor { - typedef typename TypeVec::vec_t dst_t; - static __device__ dst_t cvt(const T* src) + template + static __device__ void cvt(const T* src, D& dst) { - dst_t dst; - dst.x = saturate_cast(CV_DESCALE(src[0] * cXYZ_D65i[0] + src[1] * cXYZ_D65i[1] + src[2] * cXYZ_D65i[2], xyz_shift)); dst.y = saturate_cast(CV_DESCALE(src[0] * cXYZ_D65i[3] + src[1] * cXYZ_D65i[4] + src[2] * cXYZ_D65i[5], xyz_shift)); dst.z = saturate_cast(CV_DESCALE(src[0] * cXYZ_D65i[6] + src[1] * cXYZ_D65i[7] + src[2] * cXYZ_D65i[8], xyz_shift)); - - return dst; } }; template <> struct RGB2XYZConvertor { - typedef typename TypeVec::vec_t dst_t; - static __device__ dst_t cvt(const float* src) + template + static __device__ void cvt(const float* src, D& dst) { - dst_t dst; - dst.x = src[0] * cXYZ_D65f[0] + src[1] * cXYZ_D65f[1] + src[2] * cXYZ_D65f[2]; dst.y = src[0] * cXYZ_D65f[3] + src[1] * cXYZ_D65f[4] + src[2] * cXYZ_D65f[5]; dst.z = src[0] * cXYZ_D65f[6] + src[1] * cXYZ_D65f[7] + src[2] * cXYZ_D65f[8]; - - return dst; } }; - template + template __global__ void RGB2XYZ(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols) { typedef typename TypeVec::vec_t src_t; - typedef typename TypeVec::vec_t dst_t; - - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; - - if (y < rows && x < cols) - { - src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T)); - - *(dst_t*)(dst_ + y * dst_step + x * 3 * sizeof(T)) = RGB2XYZConvertor::cvt((const T*)(&src)); - } - } - - template struct XYZ2RGBConvertor - { - typedef typename TypeVec::vec_t src_t; - static __device__ void cvt(const src_t& src, T* dst) - { - dst[0] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[0] + src.y * cXYZ_D65i[1] + src.z * cXYZ_D65i[2], xyz_shift)); - dst[1] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[3] + src.y * cXYZ_D65i[4] + src.z * cXYZ_D65i[5], xyz_shift)); - dst[2] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[6] + src.y * cXYZ_D65i[7] + src.z * cXYZ_D65i[8], xyz_shift)); - } - }; - template <> struct XYZ2RGBConvertor - { - typedef typename TypeVec::vec_t src_t; - static __device__ void cvt(const src_t& src, float* dst) - { - dst[0] = src.x * cXYZ_D65f[0] + src.y * cXYZ_D65f[1] + src.z * cXYZ_D65f[2]; - dst[1] = src.x * cXYZ_D65f[3] + src.y * cXYZ_D65f[4] + src.z * cXYZ_D65f[5]; - dst[2] = src.x * cXYZ_D65f[6] + src.y * cXYZ_D65f[7] + src.z * cXYZ_D65f[8]; - } - }; - - template - __global__ void XYZ2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols) - { - typedef typename TypeVec::vec_t src_t; typedef typename TypeVec::vec_t dst_t; const int x = blockDim.x * blockIdx.x + threadIdx.x; @@ -919,7 +875,48 @@ namespace imgproc if (y < rows && x < cols) { - src_t src = *(const src_t*)(src_ + y * src_step + x * 3 * sizeof(T)); + src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T)); + + dst_t dst; + RGB2XYZConvertor::cvt((const T*)(&src), dst); + + *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; + } + } + + template struct XYZ2RGBConvertor + { + template + static __device__ void cvt(const T& src, D* dst) + { + dst[0] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[0] + src.y * cXYZ_D65i[1] + src.z * cXYZ_D65i[2], xyz_shift)); + dst[1] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[3] + src.y * cXYZ_D65i[4] + src.z * cXYZ_D65i[5], xyz_shift)); + dst[2] = saturate_cast(CV_DESCALE(src.x * cXYZ_D65i[6] + src.y * cXYZ_D65i[7] + src.z * cXYZ_D65i[8], xyz_shift)); + } + }; + template <> struct XYZ2RGBConvertor + { + template + static __device__ void cvt(const T& src, float* dst) + { + dst[0] = src.x * cXYZ_D65f[0] + src.y * cXYZ_D65f[1] + src.z * cXYZ_D65f[2]; + dst[1] = src.x * cXYZ_D65f[3] + src.y * cXYZ_D65f[4] + src.z * cXYZ_D65f[5]; + dst[2] = src.x * cXYZ_D65f[6] + src.y * cXYZ_D65f[7] + src.z * cXYZ_D65f[8]; + } + }; + + template + __global__ void XYZ2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols) + { + typedef typename TypeVec::vec_t src_t; + typedef typename TypeVec::vec_t dst_t; + + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (y < rows && x < cols) + { + src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T)); dst_t dst; XYZ2RGBConvertor::cvt(src, (T*)(&dst)); @@ -932,7 +929,7 @@ namespace imgproc namespace cv { namespace gpu { namespace improc { - template + template void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) { dim3 threads(32, 8, 1); @@ -941,44 +938,56 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::RGB2XYZ<<>>(src.ptr, src.step, + imgproc::RGB2XYZ<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); } - void RGB2XYZ_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, const int* coeffs, cudaStream_t stream) + void RGB2XYZ_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream) { typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream); - static const RGB2XYZ_caller_t RGB2XYZ_callers[] = {RGB2XYZ_caller, RGB2XYZ_caller}; + static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] = + { + {RGB2XYZ_caller, RGB2XYZ_caller}, + {RGB2XYZ_caller, RGB2XYZ_caller} + }; cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); - RGB2XYZ_callers[srccn-3](src, dst, stream); + RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); } - void RGB2XYZ_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, const int* coeffs, cudaStream_t stream) + void RGB2XYZ_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream) { typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream); - static const RGB2XYZ_caller_t RGB2XYZ_callers[] = {RGB2XYZ_caller, RGB2XYZ_caller}; + static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] = + { + {RGB2XYZ_caller, RGB2XYZ_caller}, + {RGB2XYZ_caller, RGB2XYZ_caller} + }; cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); - RGB2XYZ_callers[srccn-3](src, dst, stream); + RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); } - void RGB2XYZ_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, const float* coeffs, cudaStream_t stream) + void RGB2XYZ_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const float* coeffs, cudaStream_t stream) { typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream); - static const RGB2XYZ_caller_t RGB2XYZ_callers[] = {RGB2XYZ_caller, RGB2XYZ_caller}; + static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] = + { + {RGB2XYZ_caller, RGB2XYZ_caller}, + {RGB2XYZ_caller, RGB2XYZ_caller} + }; cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65f, coeffs, 9 * sizeof(float)) ); - RGB2XYZ_callers[srccn-3](src, dst, stream); + RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); } - template + template void XYZ2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) { dim3 threads(32, 8, 1); @@ -987,41 +996,53 @@ namespace cv { namespace gpu { namespace improc grid.x = divUp(src.cols, threads.x); grid.y = divUp(src.rows, threads.y); - imgproc::XYZ2RGB<<>>(src.ptr, src.step, + imgproc::XYZ2RGB<<>>(src.ptr, src.step, dst.ptr, dst.step, src.rows, src.cols); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); } - void XYZ2RGB_gpu_8u(const DevMem2D& src, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream) + void XYZ2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream) { typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream); - static const XYZ2RGB_caller_t XYZ2RGB_callers[] = {XYZ2RGB_caller, XYZ2RGB_caller}; + static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] = + { + {XYZ2RGB_caller, XYZ2RGB_caller}, + {XYZ2RGB_caller, XYZ2RGB_caller} + }; cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); - XYZ2RGB_callers[dstcn-3](src, dst, stream); + XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); } - void XYZ2RGB_gpu_16u(const DevMem2D& src, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream) + void XYZ2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream) { typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream); - static const XYZ2RGB_caller_t XYZ2RGB_callers[] = {XYZ2RGB_caller, XYZ2RGB_caller}; + static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] = + { + {XYZ2RGB_caller, XYZ2RGB_caller}, + {XYZ2RGB_caller, XYZ2RGB_caller} + }; cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); - XYZ2RGB_callers[dstcn-3](src, dst, stream); + XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); } - void XYZ2RGB_gpu_32f(const DevMem2D& src, const DevMem2D& dst, int dstcn, const float* coeffs, cudaStream_t stream) + void XYZ2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const float* coeffs, cudaStream_t stream) { typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream); - static const XYZ2RGB_caller_t XYZ2RGB_callers[] = {XYZ2RGB_caller, XYZ2RGB_caller}; + static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] = + { + {XYZ2RGB_caller, XYZ2RGB_caller}, + {XYZ2RGB_caller, XYZ2RGB_caller} + }; cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65f, coeffs, 9 * sizeof(float)) ); - XYZ2RGB_callers[dstcn-3](src, dst, stream); + XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); } }}} diff --git a/modules/gpu/src/filtering_npp.cpp b/modules/gpu/src/filtering_npp.cpp index 1ca51e152..97f08a760 100644 --- a/modules/gpu/src/filtering_npp.cpp +++ b/modules/gpu/src/filtering_npp.cpp @@ -63,6 +63,9 @@ Ptr cv::gpu::getLinearColumnFilter_GPU(int, int, const Gpu Ptr cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createDerivFilter_GPU(int, int, int, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createGaussianFilter_GPU(int, Size, double, double) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr(0); } + void cv::gpu::boxFilter(const GpuMat&, GpuMat&, int, Size, Point) { throw_nogpu(); } void cv::gpu::erode( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); } void cv::gpu::dilate( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); } @@ -105,20 +108,20 @@ namespace int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1; if (nDivisor) *nDivisor = scale; - Mat cont_krnl = (kernel.isContinuous() ? kernel : kernel.clone()).reshape(1, 1); - Mat temp; - cont_krnl.convertTo(temp, type, scale); + Mat temp(kernel.size(), type); + kernel.convertTo(temp, type, scale); + Mat cont_krnl = temp.reshape(1, 1); if (reverse) { - int count = temp.cols >> 1; + int count = cont_krnl.cols >> 1; for (int i = 0; i < count; ++i) { - std::swap(temp.at(0, i), temp.at(0, temp.cols - 1 - i)); + std::swap(cont_krnl.at(0, i), cont_krnl.at(0, cont_krnl.cols - 1 - i)); } } - gpu_krnl.upload(temp); + gpu_krnl.upload(cont_krnl); } } @@ -785,4 +788,58 @@ void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double si f->apply(src, dst); } +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Image Rank Filter + +namespace +{ + typedef NppStatus (*nppFilterRank_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI, + NppiSize oMaskSize, NppiPoint oAnchor); + + class NPPRankFilter : public BaseFilter_GPU + { + public: + NPPRankFilter(const Size& ksize_, const Point& anchor_, nppFilterRank_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {} + + virtual void operator()(const GpuMat& src, GpuMat& dst) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + NppiSize oKernelSize; + oKernelSize.height = ksize.height; + oKernelSize.width = ksize.width; + NppiPoint oAnchor; + oAnchor.x = anchor.x; + oAnchor.y = anchor.y; + + nppSafeCall( func(src.ptr(), src.step, dst.ptr(), dst.step, sz, oKernelSize, oAnchor) ); + } + + nppFilterRank_t func; + }; +} + +Ptr cv::gpu::getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor) +{ + static const nppFilterRank_t nppFilterRank_callers[] = {0, nppiFilterMax_8u_C1R, 0, 0, nppiFilterMax_8u_C4R}; + + CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType); + + normalizeAnchor(anchor, ksize); + + return Ptr(new NPPRankFilter(ksize, anchor, nppFilterRank_callers[CV_MAT_CN(srcType)])); +} + +Ptr cv::gpu::getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor) +{ + static const nppFilterRank_t nppFilterRank_callers[] = {0, nppiFilterMin_8u_C1R, 0, 0, nppiFilterMin_8u_C4R}; + + CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType); + + normalizeAnchor(anchor, ksize); + + return Ptr(new NPPRankFilter(ksize, anchor, nppFilterRank_callers[CV_MAT_CN(srcType)])); +} + #endif diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp index 9c88f3ea7..0badf8b29 100644 --- a/modules/gpu/src/imgproc_gpu.cpp +++ b/modules/gpu/src/imgproc_gpu.cpp @@ -98,21 +98,21 @@ namespace cv { namespace gpu void RGB2Gray_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream); void RGB5x52Gray_gpu(const DevMem2D& src, int green_bits, const DevMem2D& dst, cudaStream_t stream); - void RGB2YCrCb_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, const int* coeffs, cudaStream_t stream); - void RGB2YCrCb_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, const int* coeffs, cudaStream_t stream); - void RGB2YCrCb_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, const float* coeffs, cudaStream_t stream); + void RGB2YCrCb_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream); + void RGB2YCrCb_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream); + void RGB2YCrCb_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const float* coeffs, cudaStream_t stream); - void YCrCb2RGB_gpu_8u(const DevMem2D& src, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream); - void YCrCb2RGB_gpu_16u(const DevMem2D& src, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream); - void YCrCb2RGB_gpu_32f(const DevMem2D& src, const DevMem2D& dst, int dstcn, int bidx, const float* coeffs, cudaStream_t stream); + void YCrCb2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream); + void YCrCb2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const int* coeffs, cudaStream_t stream); + void YCrCb2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const float* coeffs, cudaStream_t stream); - void RGB2XYZ_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, const int* coeffs, cudaStream_t stream); - void RGB2XYZ_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, const int* coeffs, cudaStream_t stream); - void RGB2XYZ_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, const float* coeffs, cudaStream_t stream); + void RGB2XYZ_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream); + void RGB2XYZ_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream); + void RGB2XYZ_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const float* coeffs, cudaStream_t stream); - void XYZ2RGB_gpu_8u(const DevMem2D& src, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream); - void XYZ2RGB_gpu_16u(const DevMem2D& src, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream); - void XYZ2RGB_gpu_32f(const DevMem2D& src, const DevMem2D& dst, int dstcn, const float* coeffs, cudaStream_t stream); + void XYZ2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream); + void XYZ2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream); + void XYZ2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const float* coeffs, cudaStream_t stream); } }} @@ -361,7 +361,8 @@ namespace case CV_BGR2YCrCb: case CV_RGB2YCrCb: case CV_BGR2YUV: case CV_RGB2YUV: { - CV_Assert( scn == 3 || scn == 4 ); + if(dcn <= 0) dcn = 3; + CV_Assert( (scn == 3 || scn == 4) && (dcn == 3 || dcn == 4) ); bidx = code == CV_BGR2YCrCb || code == CV_RGB2YUV ? 0 : 2; @@ -382,14 +383,14 @@ namespace std::swap(coeffs_i[0], coeffs_i[2]); } - out.create(sz, CV_MAKETYPE(depth, 3)); + out.create(sz, CV_MAKETYPE(depth, dcn)); if( depth == CV_8U ) - improc::RGB2YCrCb_gpu_8u(src, scn, out, bidx, coeffs_i, stream); + improc::RGB2YCrCb_gpu_8u(src, scn, out, dcn, bidx, coeffs_i, stream); else if( depth == CV_16U ) - improc::RGB2YCrCb_gpu_16u(src, scn, out, bidx, coeffs_i, stream); + improc::RGB2YCrCb_gpu_16u(src, scn, out, dcn, bidx, coeffs_i, stream); else - improc::RGB2YCrCb_gpu_32f(src, scn, out, bidx, coeffs_f, stream); + improc::RGB2YCrCb_gpu_32f(src, scn, out, dcn, bidx, coeffs_f, stream); } break; @@ -398,7 +399,7 @@ namespace { if (dcn <= 0) dcn = 3; - CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) ); + CV_Assert( (scn == 3 || scn == 4) && (dcn == 3 || dcn == 4) ); bidx = code == CV_YCrCb2BGR || code == CV_YUV2RGB ? 0 : 2; @@ -414,17 +415,18 @@ namespace out.create(sz, CV_MAKETYPE(depth, dcn)); if( depth == CV_8U ) - improc::YCrCb2RGB_gpu_8u(src, out, dcn, bidx, coeffs_i, stream); + improc::YCrCb2RGB_gpu_8u(src, scn, out, dcn, bidx, coeffs_i, stream); else if( depth == CV_16U ) - improc::YCrCb2RGB_gpu_16u(src, out, dcn, bidx, coeffs_i, stream); + improc::YCrCb2RGB_gpu_16u(src, scn, out, dcn, bidx, coeffs_i, stream); else - improc::YCrCb2RGB_gpu_32f(src, out, dcn, bidx, coeffs_f, stream); + improc::YCrCb2RGB_gpu_32f(src, scn, out, dcn, bidx, coeffs_f, stream); } break; case CV_BGR2XYZ: case CV_RGB2XYZ: - { - CV_Assert( scn == 3 || scn == 4 ); + { + if(dcn <= 0) dcn = 3; + CV_Assert( (scn == 3 || scn == 4) && (dcn == 3 || dcn == 4) ); bidx = code == CV_BGR2XYZ ? 0 : 2; @@ -457,21 +459,21 @@ namespace std::swap(coeffs_i[6], coeffs_i[8]); } - out.create(sz, CV_MAKETYPE(depth, 3)); + out.create(sz, CV_MAKETYPE(depth, dcn)); if( depth == CV_8U ) - improc::RGB2XYZ_gpu_8u(src, scn, out, coeffs_i, stream); + improc::RGB2XYZ_gpu_8u(src, scn, out, dcn, coeffs_i, stream); else if( depth == CV_16U ) - improc::RGB2XYZ_gpu_16u(src, scn, out, coeffs_i, stream); + improc::RGB2XYZ_gpu_16u(src, scn, out, dcn, coeffs_i, stream); else - improc::RGB2XYZ_gpu_32f(src, scn, out, coeffs_f, stream); + improc::RGB2XYZ_gpu_32f(src, scn, out, dcn, coeffs_f, stream); } break; case CV_XYZ2BGR: case CV_XYZ2RGB: { if (dcn <= 0) dcn = 3; - CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) ); + CV_Assert( (scn == 3 || scn == 4) && (dcn == 3 || dcn == 4) ); bidx = code == CV_XYZ2BGR ? 0 : 2; static const float XYZ2sRGB_D65f[] = @@ -506,11 +508,11 @@ namespace out.create(sz, CV_MAKETYPE(depth, dcn)); if( depth == CV_8U ) - improc::XYZ2RGB_gpu_8u(src, out, dcn, coeffs_i, stream); + improc::XYZ2RGB_gpu_8u(src, scn, out, dcn, coeffs_i, stream); else if( depth == CV_16U ) - improc::XYZ2RGB_gpu_16u(src, out, dcn, coeffs_i, stream); + improc::XYZ2RGB_gpu_16u(src, scn, out, dcn, coeffs_i, stream); else - improc::XYZ2RGB_gpu_32f(src, out, dcn, coeffs_f, stream); + improc::XYZ2RGB_gpu_32f(src, scn, out, dcn, coeffs_f, stream); } break;