From a008d6fc17bf13d3b9724794cdb5da7791c3cbea Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 10 Oct 2012 16:55:16 +0400 Subject: [PATCH] fixed bug #2425 : Concurrent convolutions with streams --- modules/gpu/perf/perf_video.cpp | 2 +- modules/gpu/src/cuda/column_filter.cu | 23 +++++++++++--------- modules/gpu/src/cuda/imgproc.cu | 7 +++++-- modules/gpu/src/cuda/row_filter.cu | 23 +++++++++++--------- modules/gpu/src/filtering.cpp | 30 +++++++++++++-------------- 5 files changed, 46 insertions(+), 39 deletions(-) diff --git a/modules/gpu/perf/perf_video.cpp b/modules/gpu/perf/perf_video.cpp index 14998417b..f7a9debd2 100644 --- a/modules/gpu/perf/perf_video.cpp +++ b/modules/gpu/perf/perf_video.cpp @@ -146,7 +146,7 @@ PERF_TEST_P(ImagePair, Video_CreateOpticalFlowNeedleMap, } GPU_SANITY_CHECK(d_vertex); - GPU_SANITY_CHECK(d_colors) + GPU_SANITY_CHECK(d_colors); } else { diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu index 7f5d24760..21e28a863 100644 --- a/modules/gpu/src/cuda/column_filter.cu +++ b/modules/gpu/src/cuda/column_filter.cu @@ -58,9 +58,12 @@ namespace cv { namespace gpu { namespace device __constant__ float c_kernel[MAX_KERNEL_SIZE]; - void loadKernel(const float kernel[], int ksize) + void loadKernel(const float* kernel, int ksize, cudaStream_t stream) { - cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); + if (stream == 0) + cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); + else + cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); } template @@ -185,7 +188,7 @@ namespace cv { namespace gpu { namespace device } template - void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) + void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) { typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream); @@ -368,18 +371,18 @@ namespace cv { namespace gpu { namespace device } }; - loadKernel(kernel, ksize); + loadKernel(kernel, ksize, stream); callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, anchor, cc, stream); } - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); } // namespace column_filter }}} // namespace cv { namespace gpu { namespace device -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu index 9a75c52fc..7fff18796 100644 --- a/modules/gpu/src/cuda/imgproc.cu +++ b/modules/gpu/src/cuda/imgproc.cu @@ -986,7 +986,10 @@ namespace cv { namespace gpu { namespace device Filter2DCaller::call }; - cudaSafeCall(cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); + if (stream == 0) + cudaSafeCall( cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); + else + cudaSafeCall( cudaMemcpyToSymbolAsync(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); funcs[borderMode](static_cast< PtrStepSz >(srcWhole), ofsX, ofsY, static_cast< PtrStepSz >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream); } @@ -1001,4 +1004,4 @@ namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device { -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.cu b/modules/gpu/src/cuda/row_filter.cu index 7076f3870..7e739591b 100644 --- a/modules/gpu/src/cuda/row_filter.cu +++ b/modules/gpu/src/cuda/row_filter.cu @@ -58,9 +58,12 @@ namespace cv { namespace gpu { namespace device __constant__ float c_kernel[MAX_KERNEL_SIZE]; - void loadKernel(const float kernel[], int ksize) + void loadKernel(const float* kernel, int ksize, cudaStream_t stream) { - cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); + if (stream == 0) + cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); + else + cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); } template @@ -184,7 +187,7 @@ namespace cv { namespace gpu { namespace device } template - void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) + void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) { typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream); @@ -367,18 +370,18 @@ namespace cv { namespace gpu { namespace device } }; - loadKernel(kernel, ksize); + loadKernel(kernel, ksize, stream); callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, anchor, cc, stream); } - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); } // namespace row_filter }}} // namespace cv { namespace gpu { namespace device -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp index 82ccd8568..fe4a68c0f 100644 --- a/modules/gpu/src/filtering.cpp +++ b/modules/gpu/src/filtering.cpp @@ -835,13 +835,13 @@ namespace cv { namespace gpu { namespace device namespace row_filter { template - void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); } namespace column_filter { template - void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); + void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); } }}} @@ -881,7 +881,7 @@ namespace struct GpuLinearRowFilter : public BaseRowFilter_GPU { - GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : + GpuLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, gpuFilter1D_t func_, int brd_type_) : BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {} virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) @@ -891,7 +891,7 @@ namespace func(src, dst, kernel.ptr(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s)); } - Mat kernel; + GpuMat kernel; gpuFilter1D_t func; int brd_type; }; @@ -926,11 +926,10 @@ Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(srcType) == CV_MAT_CN(bufType)); - Mat temp(rowKernel.size(), CV_32FC1); - rowKernel.convertTo(temp, CV_32FC1); - Mat cont_krnl = temp.reshape(1, 1); + GpuMat gpu_row_krnl; + normalizeKernel(rowKernel, gpu_row_krnl, CV_32F); - int ksize = cont_krnl.cols; + int ksize = gpu_row_krnl.cols; CV_Assert(ksize > 0 && ksize <= 32); @@ -957,7 +956,7 @@ Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, break; } - return Ptr(new GpuLinearRowFilter(ksize, anchor, cont_krnl, func, gpuBorderType)); + return Ptr(new GpuLinearRowFilter(ksize, anchor, gpu_row_krnl, func, gpuBorderType)); } namespace @@ -991,7 +990,7 @@ namespace struct GpuLinearColumnFilter : public BaseColumnFilter_GPU { - GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : + GpuLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, gpuFilter1D_t func_, int brd_type_) : BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {} virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) @@ -1004,7 +1003,7 @@ namespace func(src, dst, kernel.ptr(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s)); } - Mat kernel; + GpuMat kernel; gpuFilter1D_t func; int brd_type; }; @@ -1039,11 +1038,10 @@ Ptr cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(dstType) == CV_MAT_CN(bufType)); - Mat temp(columnKernel.size(), CV_32FC1); - columnKernel.convertTo(temp, CV_32FC1); - Mat cont_krnl = temp.reshape(1, 1); + GpuMat gpu_col_krnl; + normalizeKernel(columnKernel, gpu_col_krnl, CV_32F); - int ksize = cont_krnl.cols; + int ksize = gpu_col_krnl.cols; CV_Assert(ksize > 0 && ksize <= 32); @@ -1070,7 +1068,7 @@ Ptr cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds break; } - return Ptr(new GpuLinearColumnFilter(ksize, anchor, cont_krnl, func, gpuBorderType)); + return Ptr(new GpuLinearColumnFilter(ksize, anchor, gpu_col_krnl, func, gpuBorderType)); } Ptr cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel,