diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 9ef46a477..fd63ff4dc 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -762,48 +762,10 @@ namespace cv
 
         //! smoothes the source image and downsamples it
         CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
-
-        struct CV_EXPORTS PyrDownBuf;
-
-        CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, PyrDownBuf& buf, Stream& stream = Stream::Null());
-
-        struct CV_EXPORTS PyrDownBuf
-        {
-            PyrDownBuf() : image_type(-1) {}
-            PyrDownBuf(Size image_size, int image_type_) : image_type(-1) { create(image_size, image_type_); }
-            void create(Size image_size, int image_type_);
-
-        private:
-            friend void pyrDown(const GpuMat&, GpuMat&, PyrDownBuf&, Stream& stream);
-
-            static Mat ker;
-            GpuMat buf;
-            Ptr<FilterEngine_GPU> filter;
-            int image_type;
-        };
 
         //! upsamples the source image and then smoothes it
         CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
 
-        struct CV_EXPORTS PyrUpBuf;
-
-        CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, PyrUpBuf& buf, Stream& stream = Stream::Null());
-
-        struct CV_EXPORTS PyrUpBuf
-        {
-            PyrUpBuf() : image_type(-1) {}
-            PyrUpBuf(Size image_size, int image_type_) : image_type(-1) { create(image_size, image_type_); }
-            void create(Size image_size, int image_type_);
-
-        private:
-            friend void pyrUp(const GpuMat&, GpuMat&, PyrUpBuf&, Stream& stream);
-
-            static Mat ker;
-            GpuMat buf;
-            Ptr<FilterEngine_GPU> filter;
-            int image_type;
-        };
-
         //! performs linear blending of two images
         //! to avoid accuracy errors sum of weigths shouldn't be very close to zero
         CV_EXPORTS void blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu
index 19f9b384c..e5f978f17 100644
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -42,10 +42,8 @@
 #include
 #include "internal_shared.hpp"
-#include "opencv2/gpu/device/utility.hpp"
 
 using namespace cv::gpu;
-using namespace cv::gpu::device;
 
 namespace cv { namespace gpu { namespace canny
 {
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index 4252a6be2..eec0fad98 100644
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -42,6 +42,8 @@
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"
+#include "opencv2/gpu/device/vec_traits.hpp"
+#include "opencv2/gpu/device/vec_math.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -976,6 +978,250 @@ namespace cv { namespace gpu { namespace imgproc
     template void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
     template void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
 
+    //////////////////////////////////////////////////////////////////////////
+    // pyrDown
+
+    template <typename T> __global__ void pyrDown(const PtrStep_<T> src, PtrStep_<T> dst, const BrdReflect101 rowBrd, const BrdReflect101 colBrd, int dst_cols)
+    {
+        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y;
+
+        __shared__ value_type smem[256 + 4];
+
+        value_type sum;
+
+        const int src_y = 2*y;
+
+        sum = VecTraits<value_type>::all(0);
+
+        sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y - 2))[rowBrd.idx(x)];
+        sum = sum + 0.25f * src.ptr(colBrd.idx(src_y - 1))[rowBrd.idx(x)];
+        sum = sum + 0.375f * src.ptr(colBrd.idx(src_y))[rowBrd.idx(x)];
+        sum = sum + 0.25f * src.ptr(colBrd.idx(src_y + 1))[rowBrd.idx(x)];
+        sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y + 2))[rowBrd.idx(x)];
+
+        smem[2 + threadIdx.x] = sum;
+
+        if (threadIdx.x < 2)
+        {
+            const int left_x = x - 2 + threadIdx.x;
+
+            sum = VecTraits<value_type>::all(0);
+
+            sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y - 2))[rowBrd.idx(left_x)];
+            sum = sum + 0.25f * src.ptr(colBrd.idx(src_y - 1))[rowBrd.idx(left_x)];
+            sum = sum + 0.375f * src.ptr(colBrd.idx(src_y))[rowBrd.idx(left_x)];
+            sum = sum + 0.25f * src.ptr(colBrd.idx(src_y + 1))[rowBrd.idx(left_x)];
+            sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y + 2))[rowBrd.idx(left_x)];
+
+            smem[threadIdx.x] = sum;
+        }
+
+        if (threadIdx.x > 253)
+        {
+            const int right_x = x + threadIdx.x + 2;
+
+            sum = VecTraits<value_type>::all(0);
+
+            sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y - 2))[rowBrd.idx(right_x)];
+            sum = sum + 0.25f * src.ptr(colBrd.idx(src_y - 1))[rowBrd.idx(right_x)];
+            sum = sum + 0.375f * src.ptr(colBrd.idx(src_y))[rowBrd.idx(right_x)];
+            sum = sum + 0.25f * src.ptr(colBrd.idx(src_y + 1))[rowBrd.idx(right_x)];
+            sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y + 2))[rowBrd.idx(right_x)];
+
+            smem[4 + threadIdx.x] = sum;
+        }
+
+        __syncthreads();
+
+        if (threadIdx.x < 128)
+        {
+            const int tid2 = threadIdx.x * 2;
+
+            sum = VecTraits<value_type>::all(0);
+
+            sum = sum + 0.0625f * smem[2 + tid2 - 2];
+            sum = sum + 0.25f * smem[2 + tid2 - 1];
+            sum = sum + 0.375f * smem[2 + tid2];
+            sum = sum + 0.25f * smem[2 + tid2 + 1];
+            sum = sum + 0.0625f * smem[2 + tid2 + 2];
+
+            const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
+
+            if (dst_x < dst_cols)
+                dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
+        }
+    }
+
+    template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
+    {
+        const dim3 block(256);
+        const dim3 grid(divUp(src.cols, block.x), dst.rows);
+
+        BrdReflect101 rowBrd(src.cols);
+        BrdReflect101 colBrd(src.rows);
+
+        pyrDown<typename TypeVec<T, cn>::vec_type><<<grid, block, 0, stream>>>(
+            static_cast< DevMem2D_<typename TypeVec<T, cn>::vec_type> >(src),
+            static_cast< DevMem2D_<typename TypeVec<T, cn>::vec_type> >(dst),
+            rowBrd, colBrd, dst.cols);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template void pyrDown_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+
+    template void pyrDown_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+
+    template void pyrDown_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+
+    template void pyrDown_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+
+    template void pyrDown_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+
+    template void pyrDown_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+
+    //////////////////////////////////////////////////////////////////////////
+    // pyrUp
+
+    template <typename T> __global__ void pyrUp(const PtrStep_<T> src, DevMem2D_<T> dst, const BrdReflect101 rowBrd, const BrdReflect101 colBrd)
+    {
+        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        __shared__ T smem1[10][10];
+        __shared__ value_type smem2[20][16];
+
+        value_type sum;
+
+        if (threadIdx.x < 10 && threadIdx.y < 10)
+            smem1[threadIdx.y][threadIdx.x] = src.ptr(colBrd.idx(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1))[rowBrd.idx(blockIdx.x * blockDim.x / 2 + threadIdx.x - 1)];
+
+        __syncthreads();
+
+        const int tidx = threadIdx.x;
+
+        sum = VecTraits<value_type>::all(0);
+
+        sum = sum + 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];
+        sum = sum + 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];
+        sum = sum + 0.375f * smem1[1 + threadIdx.y / 2][1 + ((tidx) >> 1)];
+        sum = sum + 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];
+        sum = sum + 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];
+
+        smem2[2 + threadIdx.y][tidx] = sum;
+
+        if (threadIdx.y < 2)
+        {
+            sum = VecTraits<value_type>::all(0);
+
+            sum = sum + 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];
+            sum = sum + 0.25f * smem1[0][1 + ((tidx - 1) >> 1)];
+            sum = sum + 0.375f * smem1[0][1 + ((tidx) >> 1)];
+            sum = sum + 0.25f * smem1[0][1 + ((tidx + 1) >> 1)];
+            sum = sum + 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];
+
+            smem2[threadIdx.y][tidx] = sum;
+        }
+
+        if (threadIdx.y > 13)
+        {
+            sum = VecTraits<value_type>::all(0);
+
+            sum = sum + 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];
+            sum = sum + 0.25f * smem1[9][1 + ((tidx - 1) >> 1)];
+            sum = sum + 0.375f * smem1[9][1 + ((tidx) >> 1)];
+            sum = sum + 0.25f * smem1[9][1 + ((tidx + 1) >> 1)];
+            sum = sum + 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];
+
+            smem2[4 + threadIdx.y][tidx] = sum;
+        }
+
+        __syncthreads();
+
+        sum = VecTraits<value_type>::all(0);
+
+        sum = sum + 0.0625f * smem2[2 + threadIdx.y - 2][tidx];
+        sum = sum + 0.25f * smem2[2 + threadIdx.y - 1][tidx];
+        sum = sum + 0.375f * smem2[2 + threadIdx.y][tidx];
+        sum = sum + 0.25f * smem2[2 + threadIdx.y + 1][tidx];
+        sum = sum + 0.0625f * smem2[2 + threadIdx.y + 2][tidx];
+
+        if (x < dst.cols && y < dst.rows)
+            dst.ptr(y)[x] = saturate_cast<T>(sum);
+    }
+
+    template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
+    {
+        const dim3 block(16, 16);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+        BrdReflect101 rowBrd(src.cols);
+        BrdReflect101 colBrd(src.rows);
+
+        pyrUp<typename TypeVec<T, cn>::vec_type><<<grid, block, 0, stream>>>(
+            static_cast< DevMem2D_<typename TypeVec<T, cn>::vec_type> >(src),
+            static_cast< DevMem2D_<typename TypeVec<T, cn>::vec_type> >(dst),
+            rowBrd, colBrd);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template void pyrUp_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+
+    template void pyrUp_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+
+    template void pyrUp_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+
+    template void pyrUp_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+
+    template void pyrUp_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+
+    template void pyrUp_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+
 
     //////////////////////////////////////////////////////////////////////////
     // buildWarpMaps
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index de35c3921..f4cd14365 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -93,11 +93,7 @@ void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&
 void cv::gpu::downsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::upsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::pyrDown(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::PyrDownBuf::create(Size, int) { throw_nogpu(); }
-void cv::gpu::pyrDown(const GpuMat&, GpuMat&, PyrDownBuf&, Stream&) { throw_nogpu(); }
 void cv::gpu::pyrUp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::PyrUpBuf::create(Size, int) { throw_nogpu(); }
-void cv::gpu::pyrUp(const GpuMat&, GpuMat&, PyrUpBuf&, Stream&) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_nogpu(); }
@@ -1598,66 +1594,64 @@ void cv::gpu::upsample(const GpuMat& src, GpuMat& dst, Stream& stream)
 //////////////////////////////////////////////////////////////////////////////
 // pyrDown
 
+namespace cv { namespace gpu { namespace imgproc
+{
+    template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+}}}
+
 void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
 {
-    PyrDownBuf buf;
-    pyrDown(src, dst, buf, stream);
-}
+    using namespace cv::gpu::imgproc;
 
-cv::Mat cv::gpu::PyrDownBuf::ker;
+    typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
 
-void cv::gpu::PyrDownBuf::create(Size image_size, int image_type_)
-{
-    if (ker.empty() || image_type_ != image_type)
-        ker = getGaussianKernel(5, 0, std::max(CV_32F, CV_MAT_DEPTH(image_type_)));
-
-    ensureSizeIsEnough(image_size.height, image_size.width, image_type_, buf);
-
-    if (filter.empty() || image_type_ != image_type)
+    static const func_t funcs[6][4] =
     {
-        image_type = image_type_;
-        filter = createSeparableLinearFilter_GPU(image_type, image_type, ker, ker);
-    }
-}
+        {pyrDown_gpu<uchar, 1>, pyrDown_gpu<uchar, 2>, pyrDown_gpu<uchar, 3>, pyrDown_gpu<uchar, 4>},
+        {pyrDown_gpu<schar, 1>, pyrDown_gpu<schar, 2>, pyrDown_gpu<schar, 3>, pyrDown_gpu<schar, 4>},
+        {pyrDown_gpu<ushort, 1>, pyrDown_gpu<ushort, 2>, pyrDown_gpu<ushort, 3>, pyrDown_gpu<ushort, 4>},
+        {pyrDown_gpu<short, 1>, pyrDown_gpu<short, 2>, pyrDown_gpu<short, 3>, pyrDown_gpu<short, 4>},
+        {pyrDown_gpu<int, 1>, pyrDown_gpu<int, 2>, pyrDown_gpu<int, 3>, pyrDown_gpu<int, 4>},
+        {pyrDown_gpu<float, 1>, pyrDown_gpu<float, 2>, pyrDown_gpu<float, 3>, pyrDown_gpu<float, 4>},
+    };
 
-void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, PyrDownBuf& buf, Stream& stream)
-{
-    buf.create(src.size(), src.type());
-    buf.filter->apply(src, buf.buf, Rect(0, 0, src.cols, src.rows), stream);
-    downsample(buf.buf, dst, stream);
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+
+    dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
+
+    funcs[src.depth()][src.channels() - 1](src, dst, StreamAccessor::getStream(stream));
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // pyrUp
 
+namespace cv { namespace gpu { namespace imgproc
+{
+    template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+}}}
+
 void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
 {
-    PyrUpBuf buf;
-    pyrUp(src, dst, buf, stream);
-}
+    using namespace cv::gpu::imgproc;
 
-cv::Mat cv::gpu::PyrUpBuf::ker;
+    typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
 
-void cv::gpu::PyrUpBuf::create(Size image_size, int image_type_)
-{
-    if (ker.empty() || image_type_ != image_type)
-        ker = getGaussianKernel(5, 0, std::max(CV_32F, CV_MAT_DEPTH(image_type_))) * 2;
-
-    ensureSizeIsEnough(image_size.height * 2, image_size.width * 2, image_type_, buf);
-
-    if (filter.empty() || image_type_ != image_type)
+    static const func_t funcs[6][4] =
     {
-        image_type = image_type_;
-        filter = createSeparableLinearFilter_GPU(image_type, image_type, ker, ker);
-    }
-}
+        {pyrUp_gpu<uchar, 1>, pyrUp_gpu<uchar, 2>, pyrUp_gpu<uchar, 3>, pyrUp_gpu<uchar, 4>},
+        {pyrUp_gpu<schar, 1>, pyrUp_gpu<schar, 2>, pyrUp_gpu<schar, 3>, pyrUp_gpu<schar, 4>},
+        {pyrUp_gpu<ushort, 1>, pyrUp_gpu<ushort, 2>, pyrUp_gpu<ushort, 3>, pyrUp_gpu<ushort, 4>},
+        {pyrUp_gpu<short, 1>, pyrUp_gpu<short, 2>, pyrUp_gpu<short, 3>, pyrUp_gpu<short, 4>},
+        {pyrUp_gpu<int, 1>, pyrUp_gpu<int, 2>, pyrUp_gpu<int, 3>, pyrUp_gpu<int, 4>},
+        {pyrUp_gpu<float, 1>, pyrUp_gpu<float, 2>, pyrUp_gpu<float, 3>, pyrUp_gpu<float, 4>},
+    };
 
-void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, PyrUpBuf& buf, Stream& stream)
-{
-    buf.create(src.size(), src.type());
-    upsample(src, buf.buf, stream);
-    buf.filter->apply(buf.buf, dst, Rect(0, 0, buf.buf.cols, buf.buf.rows), stream);
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+ + dst.create(src.rows*2, src.cols*2, src.type()); + + funcs[src.depth()][src.channels() - 1](src, dst, StreamAccessor::getStream(stream)); } diff --git a/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp b/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp index 45185deb4..5f7189f7a 100644 --- a/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp +++ b/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp @@ -50,7 +50,7 @@ namespace cv { namespace gpu { namespace device { struct BrdReflect101 { - explicit BrdReflect101(int len): last(len - 1) {} + explicit __host__ __device__ __forceinline__ BrdReflect101(int len): last(len - 1) {} __device__ __forceinline__ int idx_low(int i) const { @@ -67,17 +67,17 @@ namespace cv { namespace gpu { namespace device return idx_low(idx_high(i)); } - bool is_range_safe(int mini, int maxi) const + __host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const { return -last <= mini && maxi <= 2 * last; } - int last; + const int last; }; template struct BrdRowReflect101 : BrdReflect101 { - explicit BrdRowReflect101(int len): BrdReflect101(len) {} + explicit __host__ __device__ __forceinline__ BrdRowReflect101(int len): BrdReflect101(len) {} template __device__ __forceinline__ D at_low(int i, const T* data) const { @@ -92,7 +92,7 @@ namespace cv { namespace gpu { namespace device template struct BrdColReflect101 : BrdReflect101 { - BrdColReflect101(int len, size_t step): BrdReflect101(len), step(step) {} + __host__ __device__ __forceinline__ BrdColReflect101(int len, size_t step): BrdReflect101(len), step(step) {} template __device__ __forceinline__ D at_low(int i, const T* data) const { @@ -104,12 +104,12 @@ namespace cv { namespace gpu { namespace device return saturate_cast(*(const D*)((const char*)data + idx_high(i)*step)); } - size_t step; + const size_t step; }; struct BrdReplicate { - explicit BrdReplicate(int len): last(len - 1) {} + explicit __host__ __device__ __forceinline__ BrdReplicate(int len): last(len - 1) {} __device__ __forceinline__ int idx_low(int i) const { @@ -131,12 +131,12 @@ namespace cv { namespace gpu { namespace device return true; } - int last; + const int last; }; template struct BrdRowReplicate : BrdReplicate { - explicit BrdRowReplicate(int len): BrdReplicate(len) {} + explicit __host__ __device__ __forceinline__ BrdRowReplicate(int len): BrdReplicate(len) {} template __device__ __forceinline__ D at_low(int i, const T* data) const { @@ -152,7 +152,7 @@ namespace cv { namespace gpu { namespace device template struct BrdColReplicate : BrdReplicate { - BrdColReplicate(int len, size_t step): BrdReplicate(len), step(step) {} + __host__ __device__ __forceinline__ BrdColReplicate(int len, size_t step): BrdReplicate(len), step(step) {} template __device__ __forceinline__ D at_low(int i, const T* data) const { @@ -164,12 +164,12 @@ namespace cv { namespace gpu { namespace device return saturate_cast(*(const D*)((const char*)data + idx_high(i)*step)); } - size_t step; + const size_t step; }; template struct BrdRowConstant { - explicit BrdRowConstant(int len_, const D& val_ = VecTraits::all(0)): len(len_), val(val_) {} + explicit __host__ __device__ __forceinline__ BrdRowConstant(int len_, const D& val_ = VecTraits::all(0)): len(len_), val(val_) {} template __device__ __forceinline__ D at_low(int i, const T* data) const { @@ -181,18 +181,18 @@ namespace cv { namespace gpu { namespace device return i < len ? 
saturate_cast(data[i]) : val; } - bool is_range_safe(int mini, int maxi) const + __host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const { return true; } - int len; - D val; + const int len; + const D val; }; template struct BrdColConstant { - BrdColConstant(int len_, size_t step_, const D& val_ = VecTraits::all(0)): len(len_), step(step_), val(val_) {} + __host__ __device__ __forceinline__ BrdColConstant(int len_, size_t step_, const D& val_ = VecTraits::all(0)): len(len_), step(step_), val(val_) {} template __device__ __forceinline__ D at_low(int i, const T* data) const { @@ -204,19 +204,19 @@ namespace cv { namespace gpu { namespace device return i < len ? saturate_cast(*(const D*)((const char*)data + i*step)) : val; } - bool is_range_safe(int mini, int maxi) const + __host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const { return true; } - int len; - size_t step; - D val; + const int len; + const size_t step; + const D val; }; template struct BrdConstant { - BrdConstant(int w, int h, const OutT &val = VecTraits::all(0)) : w(w), h(h), val(val) {} + __host__ __device__ __forceinline__ BrdConstant(int w, int h, const OutT &val = VecTraits::all(0)) : w(w), h(h), val(val) {} __device__ __forceinline__ OutT at(int x, int y, const uchar* data, int step) const { @@ -225,7 +225,8 @@ namespace cv { namespace gpu { namespace device return val; } - int w, h; + const int w; + const int h; OutT val; }; }}} diff --git a/modules/gpu/test/test_features2d.cpp b/modules/gpu/test/test_features2d.cpp index 248ca761a..0a198b268 100644 --- a/modules/gpu/test/test_features2d.cpp +++ b/modules/gpu/test/test_features2d.cpp @@ -312,7 +312,7 @@ TEST_P(BruteForceMatcher, MatchAdd) ASSERT_EQ(0, badCount); } -TEST_P(BruteForceMatcher, KnnMatch) +TEST_P(BruteForceMatcher, KnnMatch2) { const char* distStr = dists[distType]; @@ -352,7 +352,47 @@ TEST_P(BruteForceMatcher, KnnMatch) ASSERT_EQ(0, badCount); } -TEST_P(BruteForceMatcher, KnnMatchAdd) +TEST_P(BruteForceMatcher, KnnMatch3) +{ + const char* distStr = dists[distType]; + + PRINT_PARAM(devInfo); + PRINT_PARAM(distStr); + PRINT_PARAM(dim); + + const int knn = 3; + + std::vector< std::vector > matches; + + ASSERT_NO_THROW( + cv::gpu::BruteForceMatcher_GPU_base matcher(distType); + matcher.knnMatch(cv::gpu::GpuMat(query), cv::gpu::GpuMat(train), matches, knn); + ); + + ASSERT_EQ(queryDescCount, matches.size()); + + int badCount = 0; + for (size_t i = 0; i < matches.size(); i++) + { + if ((int)matches[i].size() != knn) + badCount++; + else + { + int localBadCount = 0; + for (int k = 0; k < knn; k++) + { + cv::DMatch match = matches[i][k]; + if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k) || (match.imgIdx != 0)) + localBadCount++; + } + badCount += localBadCount > 0 ? 
1 : 0; + } + } + + ASSERT_EQ(0, badCount); +} + +TEST_P(BruteForceMatcher, KnnMatchAdd2) { const char* distStr = dists[distType]; @@ -422,6 +462,76 @@ TEST_P(BruteForceMatcher, KnnMatchAdd) ASSERT_EQ(0, badCount); } +TEST_P(BruteForceMatcher, KnnMatchAdd3) +{ + const char* distStr = dists[distType]; + + PRINT_PARAM(devInfo); + PRINT_PARAM(distStr); + PRINT_PARAM(dim); + + const int knn = 3; + std::vector< std::vector > matches; + + bool isMaskSupported; + + ASSERT_NO_THROW( + cv::gpu::BruteForceMatcher_GPU_base matcher(distType); + + cv::gpu::GpuMat d_train(train); + + // make add() twice to test such case + matcher.add(std::vector(1, d_train.rowRange(0, train.rows / 2))); + matcher.add(std::vector(1, d_train.rowRange(train.rows / 2, train.rows))); + + // prepare masks (make first nearest match illegal) + std::vector masks(2); + for (int mi = 0; mi < 2; mi++ ) + { + masks[mi] = cv::gpu::GpuMat(query.rows, train.rows / 2, CV_8UC1, cv::Scalar::all(1)); + for (int di = 0; di < queryDescCount / 2; di++) + masks[mi].col(di * countFactor).setTo(cv::Scalar::all(0)); + } + + matcher.knnMatch(cv::gpu::GpuMat(query), matches, knn, masks); + + isMaskSupported = matcher.isMaskSupported(); + ); + + ASSERT_EQ(queryDescCount, matches.size()); + + int badCount = 0; + int shift = isMaskSupported ? 1 : 0; + for (size_t i = 0; i < matches.size(); i++) + { + if ((int)matches[i].size() != knn) + badCount++; + else + { + int localBadCount = 0; + for (int k = 0; k < knn; k++) + { + cv::DMatch match = matches[i][k]; + { + if (i < queryDescCount / 2) + { + if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k + shift) || (match.imgIdx != 0) ) + localBadCount++; + } + else + { + if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + k + shift) || (match.imgIdx != 1) ) + localBadCount++; + } + } + } + badCount += localBadCount > 0 ? 
1 : 0; + } + } + + ASSERT_EQ(0, badCount); +} + TEST_P(BruteForceMatcher, RadiusMatch) { if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS)) diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp index cd9bf8410..ab0114230 100644 --- a/modules/gpu/test/test_imgproc.cpp +++ b/modules/gpu/test/test_imgproc.cpp @@ -3959,23 +3959,28 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Blend, testing::Combine( //////////////////////////////////////////////////////// // pyrDown -struct PyrDown : testing::TestWithParam +struct PyrDown : testing::TestWithParam< std::tr1::tuple > { cv::gpu::DeviceInfo devInfo; + int type; + cv::Size size; cv::Mat src; cv::Mat dst_gold; virtual void SetUp() { - devInfo = GetParam(); + devInfo = std::tr1::get<0>(GetParam()); + type = std::tr1::get<1>(GetParam()); + cv::gpu::setDevice(devInfo.deviceID()); - - cv::Mat img = readImage("stereobm/aloe-L.png"); - ASSERT_FALSE(img.empty()); - - img.convertTo(src, CV_16S); + + cv::RNG& rng = cvtest::TS::ptr()->get_rng(); + + size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); + + src = cvtest::randomMat(rng, size, type, 0.0, 255.0, false); cv::pyrDown(src, dst_gold); } @@ -3984,6 +3989,8 @@ struct PyrDown : testing::TestWithParam TEST_P(PyrDown, Accuracy) { PRINT_PARAM(devInfo); + PRINT_TYPE(type); + PRINT_PARAM(size); cv::Mat dst; @@ -3998,34 +4005,43 @@ TEST_P(PyrDown, Accuracy) ASSERT_EQ(dst_gold.cols, dst.cols); ASSERT_EQ(dst_gold.rows, dst.rows); ASSERT_EQ(dst_gold.type(), dst.type()); - - double err = cvtest::crossCorr(dst_gold, dst) / - (cv::norm(dst_gold,cv::NORM_L2)*cv::norm(dst,cv::NORM_L2)); + + double err = cvtest::crossCorr(dst_gold, dst) / (cv::norm(dst_gold,cv::NORM_L2)*cv::norm(dst,cv::NORM_L2)); ASSERT_NEAR(err, 1., 1e-2); } -INSTANTIATE_TEST_CASE_P(ImgProc, PyrDown, testing::ValuesIn(devices())); +INSTANTIATE_TEST_CASE_P(ImgProc, PyrDown, testing::Combine( + testing::ValuesIn(devices()), + testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4, + CV_16UC1, CV_16UC2, CV_16UC3, CV_16UC4, + CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4, + CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4))); //////////////////////////////////////////////////////// // pyrUp -struct PyrUp: testing::TestWithParam +struct PyrUp: testing::TestWithParam< std::tr1::tuple > { cv::gpu::DeviceInfo devInfo; + int type; + cv::Size size; cv::Mat src; cv::Mat dst_gold; virtual void SetUp() { - devInfo = GetParam(); + devInfo = std::tr1::get<0>(GetParam()); + type = std::tr1::get<1>(GetParam()); + cv::gpu::setDevice(devInfo.deviceID()); - cv::Mat img = readImage("stereobm/aloe-L.png"); - ASSERT_FALSE(img.empty()); - - img.convertTo(src, CV_16S); + cv::RNG& rng = cvtest::TS::ptr()->get_rng(); + + size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); + + src = cvtest::randomMat(rng, size, type, 0.0, 255.0, false); cv::pyrUp(src, dst_gold); } @@ -4034,6 +4050,8 @@ struct PyrUp: testing::TestWithParam TEST_P(PyrUp, Accuracy) { PRINT_PARAM(devInfo); + PRINT_TYPE(type); + PRINT_PARAM(size); cv::Mat dst; @@ -4049,12 +4067,17 @@ TEST_P(PyrUp, Accuracy) ASSERT_EQ(dst_gold.rows, dst.rows); ASSERT_EQ(dst_gold.type(), dst.type()); - double err = cvtest::crossCorr(dst_gold, dst) / - (cv::norm(dst_gold,cv::NORM_L2)*cv::norm(dst,cv::NORM_L2)); + double err = cvtest::crossCorr(dst_gold, dst) / (cv::norm(dst_gold,cv::NORM_L2)*cv::norm(dst,cv::NORM_L2)); ASSERT_NEAR(err, 1., 1e-2); } -INSTANTIATE_TEST_CASE_P(ImgProc, PyrUp, testing::ValuesIn(devices())); + +INSTANTIATE_TEST_CASE_P(ImgProc, PyrUp, testing::Combine( + testing::ValuesIn(devices()), + 
testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4, + CV_16UC1, CV_16UC2, CV_16UC3, CV_16UC4, + CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4, + CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4))); //////////////////////////////////////////////////////// // Canny diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp index 21e9766ff..b846efd73 100644 --- a/samples/gpu/performance/tests.cpp +++ b/samples/gpu/performance/tests.cpp @@ -304,8 +304,8 @@ TEST(BruteForceMatcher) gpu::GpuMat d_train(train); // Output - vector< vector > matches(1); - vector< vector > d_matches(1); + vector< vector > matches(2); + vector< vector > d_matches(2); SUBTEST << "match"; @@ -875,49 +875,269 @@ TEST(GaussianBlur) TEST(pyrDown) { - gpu::PyrDownBuf buf(Size(4000, 4000), CV_16SC3); - - for (int size = 4000; size >= 1000; size -= 1000) { - SUBTEST << "size " << size; + for (int size = 4000; size >= 1000; size -= 1000) + { + SUBTEST << "8UC1, size " << size; - Mat src; gen(src, size, size, CV_16SC3, 0, 256); - Mat dst(Size(src.cols / 2, src.rows / 2), src.type()); + Mat src; gen(src, size, size, CV_8UC1, 0, 256); + Mat dst(Size(src.cols / 2, src.rows / 2), src.type()); - CPU_ON; - pyrDown(src, dst); - CPU_OFF; + CPU_ON; + pyrDown(src, dst); + CPU_OFF; - gpu::GpuMat d_src(src); - gpu::GpuMat d_dst(Size(src.cols / 2, src.rows / 2), src.type()); + gpu::GpuMat d_src(src); + gpu::GpuMat d_dst(Size(src.cols / 2, src.rows / 2), src.type()); - GPU_ON; - gpu::pyrDown(d_src, d_dst, buf); - GPU_OFF; + GPU_ON; + gpu::pyrDown(d_src, d_dst); + GPU_OFF; + } + } + { + for (int size = 4000; size >= 1000; size -= 1000) + { + SUBTEST << "8UC3, size " << size; + + Mat src; gen(src, size, size, CV_8UC3, 0, 256); + Mat dst(Size(src.cols / 2, src.rows / 2), src.type()); + + CPU_ON; + pyrDown(src, dst); + CPU_OFF; + + gpu::GpuMat d_src(src); + gpu::GpuMat d_dst(Size(src.cols / 2, src.rows / 2), src.type()); + + GPU_ON; + gpu::pyrDown(d_src, d_dst); + GPU_OFF; + } + } + { + for (int size = 4000; size >= 1000; size -= 1000) + { + SUBTEST << "8UC4, size " << size; + + Mat src; gen(src, size, size, CV_8UC4, 0, 256); + Mat dst(Size(src.cols / 2, src.rows / 2), src.type()); + + CPU_ON; + pyrDown(src, dst); + CPU_OFF; + + gpu::GpuMat d_src(src); + gpu::GpuMat d_dst(Size(src.cols / 2, src.rows / 2), src.type()); + + GPU_ON; + gpu::pyrDown(d_src, d_dst); + GPU_OFF; + } + } + { + for (int size = 4000; size >= 1000; size -= 1000) + { + SUBTEST << "16SC3, size " << size; + + Mat src; gen(src, size, size, CV_16SC3, 0, 256); + Mat dst(Size(src.cols / 2, src.rows / 2), src.type()); + + CPU_ON; + pyrDown(src, dst); + CPU_OFF; + + gpu::GpuMat d_src(src); + gpu::GpuMat d_dst(Size(src.cols / 2, src.rows / 2), src.type()); + + GPU_ON; + gpu::pyrDown(d_src, d_dst); + GPU_OFF; + } + } + { + for (int size = 4000; size >= 1000; size -= 1000) + { + SUBTEST << "32FC1, size " << size; + + Mat src; gen(src, size, size, CV_32FC1, 0, 256); + Mat dst(Size(src.cols / 2, src.rows / 2), src.type()); + + CPU_ON; + pyrDown(src, dst); + CPU_OFF; + + gpu::GpuMat d_src(src); + gpu::GpuMat d_dst(Size(src.cols / 2, src.rows / 2), src.type()); + + GPU_ON; + gpu::pyrDown(d_src, d_dst); + GPU_OFF; + } + } + { + for (int size = 4000; size >= 1000; size -= 1000) + { + SUBTEST << "32FC3, size " << size; + + Mat src; gen(src, size, size, CV_32FC3, 0, 256); + Mat dst(Size(src.cols / 2, src.rows / 2), src.type()); + + CPU_ON; + pyrDown(src, dst); + CPU_OFF; + + gpu::GpuMat d_src(src); + gpu::GpuMat d_dst(Size(src.cols / 2, src.rows / 2), src.type()); + + GPU_ON; + 
gpu::pyrDown(d_src, d_dst); + GPU_OFF; + } + } + { + for (int size = 4000; size >= 1000; size -= 1000) + { + SUBTEST << "32FC4, size " << size; + + Mat src; gen(src, size, size, CV_32FC4, 0, 256); + Mat dst(Size(src.cols / 2, src.rows / 2), src.type()); + + CPU_ON; + pyrDown(src, dst); + CPU_OFF; + + gpu::GpuMat d_src(src); + gpu::GpuMat d_dst(Size(src.cols / 2, src.rows / 2), src.type()); + + GPU_ON; + gpu::pyrDown(d_src, d_dst); + GPU_OFF; + } } } TEST(pyrUp) { - gpu::PyrUpBuf buf(Size(4000, 4000), CV_16SC3); - - for (int size = 4000; size >= 1000; size -= 1000) { - SUBTEST << "size " << size; + for (int size = 2000; size >= 1000; size -= 1000) + { + SUBTEST << "8UC1, size " << size; - Mat src; gen(src, size, size, CV_16SC3, 0, 256); - Mat dst(Size(src.cols * 2, src.rows * 2), src.type()); + Mat src; gen(src, size, size, CV_8UC1, 0, 256); + Mat dst(Size(src.cols * 2, src.rows * 2), src.type()); - CPU_ON; - pyrUp(src, dst); - CPU_OFF; + CPU_ON; + pyrUp(src, dst); + CPU_OFF; - gpu::GpuMat d_src(src); - gpu::GpuMat d_dst(Size(src.cols * 2, src.rows * 2), src.type()); + gpu::GpuMat d_src(src); + gpu::GpuMat d_dst(Size(src.cols * 2, src.rows * 2), src.type()); - GPU_ON; - gpu::pyrUp(d_src, d_dst, buf); - GPU_OFF; + GPU_ON; + gpu::pyrUp(d_src, d_dst); + GPU_OFF; + } + } + { + for (int size = 2000; size >= 1000; size -= 1000) + { + SUBTEST << "8UC3, size " << size; + + Mat src; gen(src, size, size, CV_8UC3, 0, 256); + Mat dst(Size(src.cols * 2, src.rows * 2), src.type()); + + CPU_ON; + pyrUp(src, dst); + CPU_OFF; + + gpu::GpuMat d_src(src); + gpu::GpuMat d_dst(Size(src.cols * 2, src.rows * 2), src.type()); + + GPU_ON; + gpu::pyrUp(d_src, d_dst); + GPU_OFF; + } + } + { + for (int size = 2000; size >= 1000; size -= 1000) + { + SUBTEST << "8UC4, size " << size; + + Mat src; gen(src, size, size, CV_8UC4, 0, 256); + Mat dst(Size(src.cols * 2, src.rows * 2), src.type()); + + CPU_ON; + pyrUp(src, dst); + CPU_OFF; + + gpu::GpuMat d_src(src); + gpu::GpuMat d_dst(Size(src.cols * 2, src.rows * 2), src.type()); + + GPU_ON; + gpu::pyrUp(d_src, d_dst); + GPU_OFF; + } + } + { + for (int size = 2000; size >= 1000; size -= 1000) + { + SUBTEST << "16SC3, size " << size; + + Mat src; gen(src, size, size, CV_16SC3, 0, 256); + Mat dst(Size(src.cols * 2, src.rows * 2), src.type()); + + CPU_ON; + pyrUp(src, dst); + CPU_OFF; + + gpu::GpuMat d_src(src); + gpu::GpuMat d_dst(Size(src.cols * 2, src.rows * 2), src.type()); + + GPU_ON; + gpu::pyrUp(d_src, d_dst); + GPU_OFF; + } + } + { + for (int size = 2000; size >= 1000; size -= 1000) + { + SUBTEST << "32FC1, size " << size; + + Mat src; gen(src, size, size, CV_32FC1, 0, 256); + Mat dst(Size(src.cols * 2, src.rows * 2), src.type()); + + CPU_ON; + pyrUp(src, dst); + CPU_OFF; + + gpu::GpuMat d_src(src); + gpu::GpuMat d_dst(Size(src.cols * 2, src.rows * 2), src.type()); + + GPU_ON; + gpu::pyrUp(d_src, d_dst); + GPU_OFF; + } + } + { + for (int size = 2000; size >= 1000; size -= 1000) + { + SUBTEST << "32FC3, size " << size; + + Mat src; gen(src, size, size, CV_32FC3, 0, 256); + Mat dst(Size(src.cols * 2, src.rows * 2), src.type()); + + CPU_ON; + pyrUp(src, dst); + CPU_OFF; + + gpu::GpuMat d_src(src); + gpu::GpuMat d_dst(Size(src.cols * 2, src.rows * 2), src.type()); + + GPU_ON; + gpu::pyrUp(d_src, d_dst); + GPU_OFF; + } } }
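
Reviewer note (not part of the patch): with the PyrDownBuf/PyrUpBuf overloads gone, callers pass only a source and destination, and pyrDown/pyrUp allocate the destination themselves ((rows + 1) / 2 x (cols + 1) / 2, respectively rows * 2 x cols * 2). The sketch below is a minimal, hedged illustration of that call pattern, plus a small CPU model of the single 1/16 * [1 4 6 4 1] smoothing pass with reflect-101 borders that the new kernels apply along each axis. reflect101 and blur5Row are hypothetical helpers introduced here for illustration only; they are not OpenCV functions.

// Illustrative sketch only -- not part of the patch.
#include <opencv2/gpu/gpu.hpp>
#include <vector>
#include <cstdlib>

// Hypothetical helper: reflect-101 border indexing, analogous in effect to
// BrdReflect101::idx for indices in the range [-last, 2 * last].
static int reflect101(int i, int last)
{
    i = std::abs(i);
    return last - std::abs(last - i);
}

// Hypothetical helper: one horizontal pass of the 5-tap kernel
// 1/16 * [1 4 6 4 1] (the 0.0625/0.25/0.375 weights used in the CUDA kernels).
static std::vector<float> blur5Row(const std::vector<float>& row)
{
    static const float w[5] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f };
    const int last = static_cast<int>(row.size()) - 1;
    std::vector<float> out(row.size());
    for (int x = 0; x <= last; ++x)
    {
        float sum = 0.f;
        for (int k = -2; k <= 2; ++k)
            sum += w[k + 2] * row[reflect101(x + k, last)];
        out[x] = sum;
    }
    return out;
}

int main()
{
    // Random 8UC3 input, as in the performance tests above.
    cv::Mat src(480, 640, CV_8UC3);
    cv::randu(src, cv::Scalar::all(0), cv::Scalar::all(255));

    // New buffer-free API: no PyrDownBuf/PyrUpBuf argument, dst sized by the call.
    cv::gpu::GpuMat d_src(src), d_down, d_up;
    cv::gpu::pyrDown(d_src, d_down);
    cv::gpu::pyrUp(d_down, d_up);

    // CPU model of one horizontal smoothing pass (illustration only).
    std::vector<float> row(8, 1.0f);
    std::vector<float> smoothed = blur5Row(row);
    (void)smoothed;

    return 0;
}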