updated image for StereoConstantSpaceBP regression test
updated gpu tests for CornerHarris and CornerMinEigen moved direct convolution implementation to gpu::filter2D, gpu::convolve now use only DFT-based algorithm (Bug #1639)
This commit is contained in:
@@ -904,79 +904,49 @@ namespace cv { namespace gpu { namespace device
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// convolve
|
||||
// filter2D
|
||||
|
||||
#define CONVOLVE_MAX_KERNEL_SIZE 17
|
||||
#define FILTER2D_MAX_KERNEL_SIZE 16
|
||||
|
||||
__constant__ float c_convolveKernel[CONVOLVE_MAX_KERNEL_SIZE * CONVOLVE_MAX_KERNEL_SIZE];
|
||||
__constant__ float c_filter2DKernel[FILTER2D_MAX_KERNEL_SIZE * FILTER2D_MAX_KERNEL_SIZE];
|
||||
|
||||
__global__ void convolve(const DevMem2Df src, PtrStepf dst, int kWidth, int kHeight)
|
||||
texture<float, cudaTextureType2D, cudaReadModeElementType> filter2DTex(0, cudaFilterModePoint, cudaAddressModeBorder);
|
||||
|
||||
__global__ void filter2D(int ofsX, int ofsY, DevMem2Df dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY)
|
||||
{
|
||||
__shared__ float smem[16 + 2 * 8][16 + 2 * 8];
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
// x | x 0 | 0
|
||||
// -----------
|
||||
// x | x 0 | 0
|
||||
// 0 | 0 0 | 0
|
||||
// -----------
|
||||
// 0 | 0 0 | 0
|
||||
smem[threadIdx.y][threadIdx.x] = src.ptr(::min(::max(y - 8, 0), src.rows - 1))[::min(::max(x - 8, 0), src.cols - 1)];
|
||||
if (x >= dst.cols || y >= dst.rows)
|
||||
return;
|
||||
|
||||
// 0 | 0 x | x
|
||||
// -----------
|
||||
// 0 | 0 x | x
|
||||
// 0 | 0 0 | 0
|
||||
// -----------
|
||||
// 0 | 0 0 | 0
|
||||
smem[threadIdx.y][threadIdx.x + 16] = src.ptr(::min(::max(y - 8, 0), src.rows - 1))[::min(x + 8, src.cols - 1)];
|
||||
float res = 0;
|
||||
|
||||
// 0 | 0 0 | 0
|
||||
// -----------
|
||||
// 0 | 0 0 | 0
|
||||
// x | x 0 | 0
|
||||
// -----------
|
||||
// x | x 0 | 0
|
||||
smem[threadIdx.y + 16][threadIdx.x] = src.ptr(::min(y + 8, src.rows - 1))[::min(::max(x - 8, 0), src.cols - 1)];
|
||||
const int baseX = ofsX + x - anchorX;
|
||||
const int baseY = ofsY + y - anchorY;
|
||||
|
||||
// 0 | 0 0 | 0
|
||||
// -----------
|
||||
// 0 | 0 0 | 0
|
||||
// 0 | 0 x | x
|
||||
// -----------
|
||||
// 0 | 0 x | x
|
||||
smem[threadIdx.y + 16][threadIdx.x + 16] = src.ptr(::min(y + 8, src.rows - 1))[::min(x + 8, src.cols - 1)];
|
||||
int kInd = 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (x < src.cols && y < src.rows)
|
||||
for (int i = 0; i < kHeight; ++i)
|
||||
{
|
||||
float res = 0;
|
||||
|
||||
for (int i = 0; i < kHeight; ++i)
|
||||
{
|
||||
for (int j = 0; j < kWidth; ++j)
|
||||
{
|
||||
res += smem[threadIdx.y + 8 - kHeight / 2 + i][threadIdx.x + 8 - kWidth / 2 + j] * c_convolveKernel[i * kWidth + j];
|
||||
}
|
||||
}
|
||||
|
||||
dst.ptr(y)[x] = res;
|
||||
for (int j = 0; j < kWidth; ++j)
|
||||
res += tex2D(filter2DTex, baseX + j, baseY + i) * c_filter2DKernel[kInd++];
|
||||
}
|
||||
|
||||
dst.ptr(y)[x] = res;
|
||||
}
|
||||
|
||||
void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream)
|
||||
void filter2D_gpu(DevMem2Df src, int ofsX, int ofsY, DevMem2Df dst, int kWidth, int kHeight, int anchorX, int anchorY, float* kernel, cudaStream_t stream)
|
||||
{
|
||||
cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
|
||||
cudaSafeCall(cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
|
||||
|
||||
const dim3 block(16, 16);
|
||||
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
|
||||
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
|
||||
convolve<<<grid, block, 0, stream>>>(src, dst, kWidth, kHeight);
|
||||
bindTexture(&filter2DTex, src);
|
||||
|
||||
filter2D<<<grid, block, 0, stream>>>(ofsX, ofsY, dst, kWidth, kHeight, anchorX, anchorY);
|
||||
cudaSafeCall(cudaGetLastError());
|
||||
|
||||
if (stream == 0)
|
||||
|
@@ -659,6 +659,14 @@ void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& ke
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Linear Filter
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace imgproc
|
||||
{
|
||||
void filter2D_gpu(DevMem2Df src, int ofsX, int ofsY, DevMem2Df dst, int kWidth, int kHeight, int anchorX, int anchorY, float* kernel, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
typedef NppStatus (*nppFilter2D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
|
||||
@@ -696,20 +704,56 @@ namespace
|
||||
Npp32s nDivisor;
|
||||
nppFilter2D_t func;
|
||||
};
|
||||
|
||||
struct GpuLinearFilter : public BaseFilter_GPU
|
||||
{
|
||||
GpuLinearFilter(Size ksize_, Point anchor_, const GpuMat& kernel_) :
|
||||
BaseFilter_GPU(ksize_, anchor_), kernel(kernel_) {}
|
||||
|
||||
virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
|
||||
{
|
||||
using namespace cv::gpu::device::imgproc;
|
||||
|
||||
Point ofs;
|
||||
Size wholeSize;
|
||||
src.locateROI(wholeSize, ofs);
|
||||
GpuMat srcWhole(wholeSize, src.type(), src.datastart);
|
||||
|
||||
filter2D_gpu(srcWhole, ofs.x, ofs.y, dst, ksize.width, ksize.height, anchor.x, anchor.y, kernel.ptr<float>(), StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
GpuMat kernel;
|
||||
};
|
||||
}
|
||||
|
||||
Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize, Point anchor)
|
||||
{
|
||||
static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R};
|
||||
CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_32FC1);
|
||||
CV_Assert(dstType == srcType);
|
||||
|
||||
CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);
|
||||
if (srcType == CV_32FC1)
|
||||
{
|
||||
CV_Assert(ksize.width * ksize.height <= 16 * 16);
|
||||
|
||||
GpuMat gpu_krnl;
|
||||
normalizeKernel(kernel, gpu_krnl, CV_32F);
|
||||
|
||||
normalizeAnchor(anchor, ksize);
|
||||
|
||||
return Ptr<BaseFilter_GPU>(new GpuLinearFilter(ksize, anchor, gpu_krnl));
|
||||
}
|
||||
else
|
||||
{
|
||||
static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R};
|
||||
|
||||
GpuMat gpu_krnl;
|
||||
int nDivisor;
|
||||
normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true);
|
||||
normalizeAnchor(anchor, ksize);
|
||||
GpuMat gpu_krnl;
|
||||
int nDivisor;
|
||||
normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true);
|
||||
|
||||
return Ptr<BaseFilter_GPU>(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)]));
|
||||
normalizeAnchor(anchor, ksize);
|
||||
|
||||
return Ptr<BaseFilter_GPU>(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)]));
|
||||
}
|
||||
}
|
||||
|
||||
Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Point& anchor)
|
||||
@@ -729,7 +773,8 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
|
||||
dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
|
||||
|
||||
Ptr<FilterEngine_GPU> f = createLinearFilter_GPU(src.type(), dst.type(), kernel, anchor);
|
||||
f->apply(src, dst, Rect(0, 0, -1, -1), stream);
|
||||
|
||||
f->apply(src, dst, src.type() == CV_32FC1 ? Rect(0, 0, src.cols, src.rows) : Rect(0, 0, -1, -1), stream);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
@@ -1673,137 +1673,82 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
|
||||
convolve(image, templ, result, ccorr, buf);
|
||||
}
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace imgproc
|
||||
{
|
||||
void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)
|
||||
{
|
||||
using namespace ::cv::gpu::device::imgproc;
|
||||
|
||||
#ifndef HAVE_CUFFT
|
||||
|
||||
CV_Assert(image.type() == CV_32F);
|
||||
CV_Assert(templ.type() == CV_32F);
|
||||
CV_Assert(templ.cols <= 17 && templ.rows <= 17);
|
||||
|
||||
result.create(image.size(), CV_32F);
|
||||
|
||||
GpuMat& contKernel = buf.templ_block;
|
||||
|
||||
if (templ.isContinuous())
|
||||
contKernel = templ;
|
||||
else
|
||||
{
|
||||
contKernel = createContinuous(templ.size(), templ.type());
|
||||
|
||||
if (stream)
|
||||
stream.enqueueCopy(templ, contKernel);
|
||||
else
|
||||
templ.copyTo(contKernel);
|
||||
}
|
||||
|
||||
convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));
|
||||
|
||||
throw_nogpu();
|
||||
#else
|
||||
|
||||
StaticAssert<sizeof(float) == sizeof(cufftReal)>::check();
|
||||
StaticAssert<sizeof(float) * 2 == sizeof(cufftComplex)>::check();
|
||||
|
||||
CV_Assert(image.type() == CV_32F);
|
||||
CV_Assert(templ.type() == CV_32F);
|
||||
|
||||
if (templ.cols < 13 && templ.rows < 13)
|
||||
buf.create(image.size(), templ.size());
|
||||
result.create(buf.result_size, CV_32F);
|
||||
|
||||
Size& block_size = buf.block_size;
|
||||
Size& dft_size = buf.dft_size;
|
||||
|
||||
GpuMat& image_block = buf.image_block;
|
||||
GpuMat& templ_block = buf.templ_block;
|
||||
GpuMat& result_data = buf.result_data;
|
||||
|
||||
GpuMat& image_spect = buf.image_spect;
|
||||
GpuMat& templ_spect = buf.templ_spect;
|
||||
GpuMat& result_spect = buf.result_spect;
|
||||
|
||||
cufftHandle planR2C, planC2R;
|
||||
cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
|
||||
cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));
|
||||
|
||||
cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );
|
||||
cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );
|
||||
|
||||
GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step);
|
||||
copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
|
||||
templ_block.cols - templ_roi.cols, 0, Scalar(), stream);
|
||||
|
||||
cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(),
|
||||
templ_spect.ptr<cufftComplex>()));
|
||||
|
||||
// Process all blocks of the result matrix
|
||||
for (int y = 0; y < result.rows; y += block_size.height)
|
||||
{
|
||||
result.create(image.size(), CV_32F);
|
||||
|
||||
GpuMat& contKernel = buf.templ_block;
|
||||
|
||||
if (templ.isContinuous())
|
||||
contKernel = templ;
|
||||
else
|
||||
for (int x = 0; x < result.cols; x += block_size.width)
|
||||
{
|
||||
contKernel = createContinuous(templ.size(), templ.type());
|
||||
Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
|
||||
std::min(y + dft_size.height, image.rows) - y);
|
||||
GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
|
||||
image.step);
|
||||
copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
|
||||
0, image_block.cols - image_roi.cols, 0, Scalar(), stream);
|
||||
|
||||
cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
|
||||
image_spect.ptr<cufftComplex>()));
|
||||
mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
|
||||
1.f / dft_size.area(), ccorr, stream);
|
||||
cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
|
||||
result_data.ptr<cufftReal>()));
|
||||
|
||||
Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
|
||||
std::min(y + block_size.height, result.rows) - y);
|
||||
GpuMat result_roi(result_roi_size, result.type(),
|
||||
(void*)(result.ptr<float>(y) + x), result.step);
|
||||
GpuMat result_block(result_roi_size, result_data.type(),
|
||||
result_data.ptr(), result_data.step);
|
||||
|
||||
if (stream)
|
||||
stream.enqueueCopy(templ, contKernel);
|
||||
stream.enqueueCopy(result_block, result_roi);
|
||||
else
|
||||
templ.copyTo(contKernel);
|
||||
result_block.copyTo(result_roi);
|
||||
}
|
||||
|
||||
convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));
|
||||
}
|
||||
else
|
||||
{
|
||||
buf.create(image.size(), templ.size());
|
||||
result.create(buf.result_size, CV_32F);
|
||||
|
||||
Size& block_size = buf.block_size;
|
||||
Size& dft_size = buf.dft_size;
|
||||
|
||||
GpuMat& image_block = buf.image_block;
|
||||
GpuMat& templ_block = buf.templ_block;
|
||||
GpuMat& result_data = buf.result_data;
|
||||
|
||||
GpuMat& image_spect = buf.image_spect;
|
||||
GpuMat& templ_spect = buf.templ_spect;
|
||||
GpuMat& result_spect = buf.result_spect;
|
||||
|
||||
cufftHandle planR2C, planC2R;
|
||||
cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
|
||||
cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));
|
||||
|
||||
cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );
|
||||
cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );
|
||||
|
||||
GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step);
|
||||
copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
|
||||
templ_block.cols - templ_roi.cols, 0, Scalar(), stream);
|
||||
|
||||
cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(),
|
||||
templ_spect.ptr<cufftComplex>()));
|
||||
|
||||
// Process all blocks of the result matrix
|
||||
for (int y = 0; y < result.rows; y += block_size.height)
|
||||
{
|
||||
for (int x = 0; x < result.cols; x += block_size.width)
|
||||
{
|
||||
Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
|
||||
std::min(y + dft_size.height, image.rows) - y);
|
||||
GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
|
||||
image.step);
|
||||
copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
|
||||
0, image_block.cols - image_roi.cols, 0, Scalar(), stream);
|
||||
|
||||
cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
|
||||
image_spect.ptr<cufftComplex>()));
|
||||
mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
|
||||
1.f / dft_size.area(), ccorr, stream);
|
||||
cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
|
||||
result_data.ptr<cufftReal>()));
|
||||
|
||||
Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
|
||||
std::min(y + block_size.height, result.rows) - y);
|
||||
GpuMat result_roi(result_roi_size, result.type(),
|
||||
(void*)(result.ptr<float>(y) + x), result.step);
|
||||
GpuMat result_block(result_roi_size, result_data.type(),
|
||||
result_data.ptr(), result_data.step);
|
||||
|
||||
if (stream)
|
||||
stream.enqueueCopy(result_block, result_roi);
|
||||
else
|
||||
result_block.copyTo(result_roi);
|
||||
}
|
||||
}
|
||||
|
||||
cufftSafeCall(cufftDestroy(planR2C));
|
||||
cufftSafeCall(cufftDestroy(planC2R));
|
||||
}
|
||||
|
||||
cufftSafeCall(cufftDestroy(planR2C));
|
||||
cufftSafeCall(cufftDestroy(planC2R));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user