From 68d04d28b6d1bfec6466b737dd6bc9ac41425e81 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Sat, 24 Nov 2012 16:50:29 +0400 Subject: [PATCH] replace offsets in surf to simple copy for better speed --- .../include/opencv2/gpu/device/utility.hpp | 2 +- modules/gpu/src/cuda/surf.cu | 32 +++++++++---------- modules/gpu/src/surf.cpp | 18 +++++++---- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/device/utility.hpp b/modules/gpu/include/opencv2/gpu/device/utility.hpp index 4489a20b1..88a73a10e 100644 --- a/modules/gpu/include/opencv2/gpu/device/utility.hpp +++ b/modules/gpu/include/opencv2/gpu/device/utility.hpp @@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device return true; } - static __device__ __forceinline__ bool check(int, int, int, uint offset = 0) + static __device__ __forceinline__ bool check(int, int, int) { return true; } diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index 8c80559c5..37c4eb48a 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -177,7 +177,7 @@ namespace cv { namespace gpu { namespace device return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; } - __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace, uint sumOffset) + __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace) { // Determine the indices const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2); @@ -198,9 +198,9 @@ namespace cv { namespace gpu { namespace device if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j) { - const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), sumOffset + (j << c_octave)); - const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), sumOffset + (j << c_octave)); - const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), sumOffset + (j << c_octave)); + const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave)); + const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave)); + const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave)); det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy; trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy; @@ -208,7 +208,7 @@ namespace cv { namespace gpu { namespace device } void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, - int octave, int nOctaveLayers, const size_t sumOffset) + int octave, int nOctaveLayers) { const int min_size = calcSize(octave, 0); const int max_samples_i = 1 + ((img_rows - min_size) >> octave); @@ -220,7 +220,7 @@ namespace cv { namespace gpu { namespace device grid.x = divUp(max_samples_j, threads.x); grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2); - icvCalcLayerDetAndTrace<<>>(det, trace, (uint)sumOffset); + icvCalcLayerDetAndTrace<<>>(det, trace); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); @@ -233,7 +233,7 @@ namespace cv { namespace gpu { namespace device struct WithMask { - static __device__ bool check(int sum_i, int sum_j, int size, const uint offset) + static __device__ bool check(int sum_i, int sum_j, int size) { float ratio = (float)size / 9.0f; @@ -245,10 +245,10 @@ namespace cv { namespace gpu { namespace device int dy2 = __float2int_rn(ratio * c_DM[3]); float t = 0; - t += tex2D(maskSumTex, offset + sum_j + dx1, sum_i + dy1); - t -= tex2D(maskSumTex, offset + sum_j + dx1, sum_i + dy2); - t -= tex2D(maskSumTex, offset + sum_j + dx2, sum_i + dy1); - t += tex2D(maskSumTex, offset + sum_j + dx2, sum_i + dy2); + t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1); + t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2); + t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1); + t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2); d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1)); @@ -258,7 +258,7 @@ namespace cv { namespace gpu { namespace device template __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, - unsigned int* maxCounter, const uint maskOffset) + unsigned int* maxCounter) { #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110 @@ -299,7 +299,7 @@ namespace cv { namespace gpu { namespace device const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave; const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave; - if (Mask::check(sum_i, sum_j, size, maskOffset)) + if (Mask::check(sum_i, sum_j, size)) { // Check to see if we have a max (in its 26 neighbours) const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff] @@ -351,7 +351,7 @@ namespace cv { namespace gpu { namespace device } void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter, - int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers, const size_t maskOffset) + int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers) { const int layer_rows = img_rows >> octave; const int layer_cols = img_cols >> octave; @@ -367,9 +367,9 @@ namespace cv { namespace gpu { namespace device const size_t smem_size = threads.x * threads.y * 3 * sizeof(float); if (use_mask) - icvFindMaximaInLayer<<>>(det, trace, maxPosBuffer, maxCounter, (uint)maskOffset); + icvFindMaximaInLayer<<>>(det, trace, maxPosBuffer, maxCounter); else - icvFindMaximaInLayer<<>>(det, trace, maxPosBuffer, maxCounter, 0); + icvFindMaximaInLayer<<>>(det, trace, maxPosBuffer, maxCounter); cudaSafeCall( cudaGetLastError() ); diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp index 72bb9c15e..05e225be2 100644 --- a/modules/gpu/src/surf.cpp +++ b/modules/gpu/src/surf.cpp @@ -75,10 +75,10 @@ namespace cv { namespace gpu { namespace device size_t bindMaskSumTex(PtrStepSz maskSum); void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, - int octave, int nOctaveLayers, const size_t sumOffset); + int octave, int nOctaveLayer); void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter, - int img_rows, int img_cols, int octave, bool use_mask, int nLayers, const size_t maskOffset); + int img_rows, int img_cols, int octave, bool use_mask, int nLayers); void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian, @@ -146,14 +146,17 @@ namespace loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast(surf_.hessianThreshold)); bindImgTex(img); - integralBuffered(img, surf_.sum, surf_.intBuffer); + + integralBuffered(img, tmpSum, surf_.intBuffer); + tmpSum.copyTo(surf_.sum); sumOffset = bindSumTex(surf_.sum); if (use_mask) { min(mask, 1.0, surf_.mask1); - integralBuffered(surf_.mask1, surf_.maskSum, surf_.intBuffer); + integralBuffered(surf_.mask1, tmpMaskSum, surf_.intBuffer); + tmpMaskSum.copyTo(surf_.maskSum); maskOffset = bindMaskSumTex(surf_.maskSum); } } @@ -174,10 +177,10 @@ namespace loadOctaveConstants(octave, layer_rows, layer_cols); - icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers, sumOffset); + icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers); icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr(), counters.ptr() + 1 + octave, - img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers, maskOffset); + img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers); unsigned int maxCounter; cudaSafeCall( cudaMemcpy(&maxCounter, counters.ptr() + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); @@ -228,6 +231,9 @@ namespace private: SURF_GPU& surf_; + cv::gpu::GpuMat tmpSum; + cv::gpu::GpuMat tmpMaskSum; + int img_cols, img_rows; bool use_mask;