From be0c20b7582bbe89fdbd5b611d6d0ea657294e40 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" <marina.kolpakova@itseez.com> Date: Sat, 24 Nov 2012 01:55:03 +0400 Subject: [PATCH 1/6] align grid by 4 --- modules/gpu/src/cuda/integral_image.cu | 13 ++++++++++--- modules/gpu/src/imgproc.cpp | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu index 558f9085d..a34a52a31 100644 --- a/modules/gpu/src/cuda/integral_image.cu +++ b/modules/gpu/src/cuda/integral_image.cu @@ -357,18 +357,25 @@ namespace cv { namespace gpu { namespace device #endif } - void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream) + void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream) { { // each thread handles 16 values, use 1 block/row - const int block = img.cols / 16; + int block = img.cols / 16; + + // save, becouse step is actually can't be less 512 bytes + int align = img.cols % 4; + if ( align != 0) + { + block += (4 - align); + } // launch 1 block / row const int grid = img.rows; cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) ); - shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral); + shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral); cudaSafeCall( cudaGetLastError() ); } diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index 0bf9c81c2..81a2248fd 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -537,7 +537,7 @@ namespace cv { namespace gpu { namespace device { namespace imgproc { - void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream); + void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream); } }}} From 68d04d28b6d1bfec6466b737dd6bc9ac41425e81 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" <marina.kolpakova@itseez.com> Date: Sat, 24 Nov 2012 16:50:29 +0400 Subject: [PATCH 2/6] replace offsets in surf to simple copy for better speed --- .../include/opencv2/gpu/device/utility.hpp | 2 +- modules/gpu/src/cuda/surf.cu | 32 +++++++++---------- modules/gpu/src/surf.cpp | 18 +++++++---- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/device/utility.hpp b/modules/gpu/include/opencv2/gpu/device/utility.hpp index 4489a20b1..88a73a10e 100644 --- a/modules/gpu/include/opencv2/gpu/device/utility.hpp +++ b/modules/gpu/include/opencv2/gpu/device/utility.hpp @@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device return true; } - static __device__ __forceinline__ bool check(int, int, int, uint offset = 0) + static __device__ __forceinline__ bool check(int, int, int) { return true; } diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index 8c80559c5..37c4eb48a 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -177,7 +177,7 @@ namespace cv { namespace gpu { namespace device return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; } - __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace, uint sumOffset) + __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace) { // Determine the indices const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2); @@ -198,9 +198,9 @@ namespace cv { namespace gpu { namespace device if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j) { - const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), sumOffset + (j << c_octave)); - const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), sumOffset + (j << c_octave)); - const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), sumOffset + (j << c_octave)); + const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave)); + const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave)); + const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave)); det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy; trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy; @@ -208,7 +208,7 @@ namespace cv { namespace gpu { namespace device } void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, - int octave, int nOctaveLayers, const size_t sumOffset) + int octave, int nOctaveLayers) { const int min_size = calcSize(octave, 0); const int max_samples_i = 1 + ((img_rows - min_size) >> octave); @@ -220,7 +220,7 @@ namespace cv { namespace gpu { namespace device grid.x = divUp(max_samples_j, threads.x); grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2); - icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace, (uint)sumOffset); + icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); @@ -233,7 +233,7 @@ namespace cv { namespace gpu { namespace device struct WithMask { - static __device__ bool check(int sum_i, int sum_j, int size, const uint offset) + static __device__ bool check(int sum_i, int sum_j, int size) { float ratio = (float)size / 9.0f; @@ -245,10 +245,10 @@ namespace cv { namespace gpu { namespace device int dy2 = __float2int_rn(ratio * c_DM[3]); float t = 0; - t += tex2D(maskSumTex, offset + sum_j + dx1, sum_i + dy1); - t -= tex2D(maskSumTex, offset + sum_j + dx1, sum_i + dy2); - t -= tex2D(maskSumTex, offset + sum_j + dx2, sum_i + dy1); - t += tex2D(maskSumTex, offset + sum_j + dx2, sum_i + dy2); + t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1); + t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2); + t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1); + t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2); d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1)); @@ -258,7 +258,7 @@ namespace cv { namespace gpu { namespace device template <typename Mask> __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, - unsigned int* maxCounter, const uint maskOffset) + unsigned int* maxCounter) { #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110 @@ -299,7 +299,7 @@ namespace cv { namespace gpu { namespace device const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave; const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave; - if (Mask::check(sum_i, sum_j, size, maskOffset)) + if (Mask::check(sum_i, sum_j, size)) { // Check to see if we have a max (in its 26 neighbours) const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff] @@ -351,7 +351,7 @@ namespace cv { namespace gpu { namespace device } void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter, - int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers, const size_t maskOffset) + int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers) { const int layer_rows = img_rows >> octave; const int layer_cols = img_cols >> octave; @@ -367,9 +367,9 @@ namespace cv { namespace gpu { namespace device const size_t smem_size = threads.x * threads.y * 3 * sizeof(float); if (use_mask) - icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter, (uint)maskOffset); + icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter); else - icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter, 0); + icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter); cudaSafeCall( cudaGetLastError() ); diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp index 72bb9c15e..05e225be2 100644 --- a/modules/gpu/src/surf.cpp +++ b/modules/gpu/src/surf.cpp @@ -75,10 +75,10 @@ namespace cv { namespace gpu { namespace device size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum); void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, - int octave, int nOctaveLayers, const size_t sumOffset); + int octave, int nOctaveLayer); void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter, - int img_rows, int img_cols, int octave, bool use_mask, int nLayers, const size_t maskOffset); + int img_rows, int img_cols, int octave, bool use_mask, int nLayers); void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian, @@ -146,14 +146,17 @@ namespace loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold)); bindImgTex(img); - integralBuffered(img, surf_.sum, surf_.intBuffer); + + integralBuffered(img, tmpSum, surf_.intBuffer); + tmpSum.copyTo(surf_.sum); sumOffset = bindSumTex(surf_.sum); if (use_mask) { min(mask, 1.0, surf_.mask1); - integralBuffered(surf_.mask1, surf_.maskSum, surf_.intBuffer); + integralBuffered(surf_.mask1, tmpMaskSum, surf_.intBuffer); + tmpMaskSum.copyTo(surf_.maskSum); maskOffset = bindMaskSumTex(surf_.maskSum); } } @@ -174,10 +177,10 @@ namespace loadOctaveConstants(octave, layer_rows, layer_cols); - icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers, sumOffset); + icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers); icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave, - img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers, maskOffset); + img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers); unsigned int maxCounter; cudaSafeCall( cudaMemcpy(&maxCounter, counters.ptr<unsigned int>() + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); @@ -228,6 +231,9 @@ namespace private: SURF_GPU& surf_; + cv::gpu::GpuMat tmpSum; + cv::gpu::GpuMat tmpMaskSum; + int img_cols, img_rows; bool use_mask; From 5460cee9e96bf7e64a09cf8fd9cd81fc05ecef04 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" <marina.kolpakova@itseez.com> Date: Sun, 25 Nov 2012 03:19:24 +0400 Subject: [PATCH 3/6] fix cascade classifier GFF NMS for empty candidates vector --- modules/gpu/src/cuda/lbp.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu index 9b729fe63..55f5d7512 100644 --- a/modules/gpu/src/cuda/lbp.cu +++ b/modules/gpu/src/cuda/lbp.cu @@ -185,6 +185,7 @@ namespace cv { namespace gpu { namespace device void connectedConmonents(PtrStepSz<int4> candidates, int ncandidates, PtrStepSz<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses) { + if (!ncandidates) return; int block = ncandidates; int smem = block * ( sizeof(int) + sizeof(int4) ); disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses); From 91913364d6cb854455c1b362dcdb8cb5f6742044 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" <marina.kolpakova@itseez.com> Date: Sun, 25 Nov 2012 03:21:51 +0400 Subject: [PATCH 4/6] reintegrate warp shuffle based integral --- modules/gpu/src/cuda/integral_image.cu | 8 +----- modules/gpu/src/imgproc.cpp | 39 +++++++------------------- 2 files changed, 11 insertions(+), 36 deletions(-) diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu index a34a52a31..09187fd25 100644 --- a/modules/gpu/src/cuda/integral_image.cu +++ b/modules/gpu/src/cuda/integral_image.cu @@ -361,14 +361,8 @@ namespace cv { namespace gpu { namespace device { { // each thread handles 16 values, use 1 block/row - int block = img.cols / 16; - // save, becouse step is actually can't be less 512 bytes - int align = img.cols % 4; - if ( align != 0) - { - block += (4 - align); - } + int block = integral.cols / 16; // launch 1 block / row const int grid = img.rows; diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index 81a2248fd..309b14ae9 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -553,44 +553,25 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S src.locateROI(whole, offset); - if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048) + if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048 && offset.x % 16 == 0 && (src.cols + 63) / 64 <= (src.step - offset.x)) { - GpuMat srcAlligned; + ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer); - if (src.cols % 16 == 0 && src.rows % 8 == 0 && offset.x % 16 == 0 && offset.y % 8 == 0) - srcAlligned = src; - else - { - ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 15) / 16) * 16, src.type(), buffer); - - GpuMat inner = buffer(Rect(0, 0, src.cols, src.rows)); - - if (s) - { - s.enqueueMemSet(buffer, Scalar::all(0)); - s.enqueueCopy(src, inner); - } - else - { - buffer.setTo(Scalar::all(0)); - src.copyTo(inner); - } - - srcAlligned = buffer; - } - - sum.create(srcAlligned.rows + 1, srcAlligned.cols + 4, CV_32SC1); + cv::gpu::device::imgproc::shfl_integral_gpu(src, buffer, stream); + sum.create(src.rows + 1, src.cols + 1, CV_32SC1); if (s) s.enqueueMemSet(sum, Scalar::all(0)); else sum.setTo(Scalar::all(0)); - GpuMat inner = sum(Rect(4, 1, srcAlligned.cols, srcAlligned.rows)); + GpuMat inner = sum(Rect(1, 1, src.cols, src.rows)); + GpuMat res = buffer(Rect(0, 0, src.cols, src.rows)); - cv::gpu::device::imgproc::shfl_integral_gpu(srcAlligned, inner, stream); - - sum = sum(Rect(3, 0, src.cols + 1, src.rows + 1)); + if (s) + s.enqueueCopy(res, inner); + else + res.copyTo(inner); } else { From 7df45c0dcc5ccbe0d15bab8a180c1d6ccd710efa Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" <marina.kolpakova@itseez.com> Date: Sun, 25 Nov 2012 03:26:50 +0400 Subject: [PATCH 5/6] remove unnecessary copying in SURF --- modules/gpu/src/surf.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp index 05e225be2..5a1b07444 100644 --- a/modules/gpu/src/surf.cpp +++ b/modules/gpu/src/surf.cpp @@ -147,16 +147,13 @@ namespace bindImgTex(img); - integralBuffered(img, tmpSum, surf_.intBuffer); - tmpSum.copyTo(surf_.sum); - + integralBuffered(img, surf_.sum, surf_.intBuffer); sumOffset = bindSumTex(surf_.sum); if (use_mask) { min(mask, 1.0, surf_.mask1); - integralBuffered(surf_.mask1, tmpMaskSum, surf_.intBuffer); - tmpMaskSum.copyTo(surf_.maskSum); + integralBuffered(surf_.mask1, surf_.maskSum, surf_.intBuffer); maskOffset = bindMaskSumTex(surf_.maskSum); } } @@ -231,9 +228,6 @@ namespace private: SURF_GPU& surf_; - cv::gpu::GpuMat tmpSum; - cv::gpu::GpuMat tmpMaskSum; - int img_cols, img_rows; bool use_mask; From a22edb037f211291f3c7bfc0ecc9bea3322960e2 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" <marina.kolpakova@itseez.com> Date: Mon, 26 Nov 2012 17:57:56 +0400 Subject: [PATCH 6/6] fixed typo --- modules/gpu/src/imgproc.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index 309b14ae9..7f63737ef 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -553,7 +553,8 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S src.locateROI(whole, offset); - if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048 && offset.x % 16 == 0 && (src.cols + 63) / 64 <= (src.step - offset.x)) + if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048 + && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (src.step - offset.x)) { ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);