From 296aa7c4fbf29b3a76773e23e576fb8e4b6a071c Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Fri, 12 Oct 2012 13:02:35 +0400 Subject: [PATCH] One more fix for Kepler-specific gpu::integral usage --- .../include/opencv2/gpu/device/utility.hpp | 2 +- modules/gpu/src/cuda/surf.cu | 49 +++++++++++-------- modules/gpu/src/imgproc.cpp | 2 +- modules/gpu/src/surf.cpp | 24 +++++---- 4 files changed, 46 insertions(+), 31 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/device/utility.hpp b/modules/gpu/include/opencv2/gpu/device/utility.hpp index 072f42d77..5e2e5eaa2 100644 --- a/modules/gpu/include/opencv2/gpu/device/utility.hpp +++ b/modules/gpu/include/opencv2/gpu/device/utility.hpp @@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device return true; } - static __device__ __forceinline__ bool check(int, int, int) + static __device__ __forceinline__ bool check(int, int, int, uint offset = 0) { return true; } diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index 509132f57..3f21fd6d7 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -108,13 +108,20 @@ namespace cv { namespace gpu { namespace device { bindTexture(&imgTex, img); } - void bindSumTex(PtrStepSz sum) + + size_t bindSumTex(PtrStepSz sum) { - bindTexture(&sumTex, sum); + size_t offset; + cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(&offset, sumTex, sum.data, desc_sum, sum.cols, sum.rows, sum.step)); + return offset / sizeof(uint); } - void bindMaskSumTex(PtrStepSz maskSum) + size_t bindMaskSumTex(PtrStepSz maskSum) { - bindTexture(&maskSumTex, maskSum); + size_t offset; + cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(&offset, maskSumTex, maskSum.data, desc_sum, maskSum.cols, maskSum.rows, maskSum.step)); + return offset / sizeof(uint); } template __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x) @@ -170,7 +177,7 @@ namespace cv { namespace gpu { namespace device return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; } - __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace) + __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace, uint sumOffset) { // Determine the indices const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2); @@ -191,16 +198,17 @@ namespace cv { namespace gpu { namespace device if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j) { - const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, i << c_octave, j << c_octave); - const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, i << c_octave, j << c_octave); - const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, i << c_octave, j << c_octave); + const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), sumOffset + (j << c_octave)); + const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), sumOffset + (j << c_octave)); + const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), sumOffset + (j << c_octave)); det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy; trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy; } } - void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers) + void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, + int octave, int nOctaveLayers, const size_t sumOffset) { const int min_size = calcSize(octave, 0); const int max_samples_i = 1 + ((img_rows - min_size) >> octave); @@ -212,7 +220,7 @@ namespace cv { namespace gpu { namespace device grid.x = divUp(max_samples_j, threads.x); grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2); - icvCalcLayerDetAndTrace<<>>(det, trace); + icvCalcLayerDetAndTrace<<>>(det, trace, (uint)sumOffset); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); @@ -225,7 +233,7 @@ namespace cv { namespace gpu { namespace device struct WithMask { - static __device__ bool check(int sum_i, int sum_j, int size) + static __device__ bool check(int sum_i, int sum_j, int size, const uint offset) { float ratio = (float)size / 9.0f; @@ -237,10 +245,10 @@ namespace cv { namespace gpu { namespace device int dy2 = __float2int_rn(ratio * c_DM[3]); float t = 0; - t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1); - t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2); - t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1); - t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2); + t += tex2D(maskSumTex, offset + sum_j + dx1, sum_i + dy1); + t -= tex2D(maskSumTex, offset + sum_j + dx1, sum_i + dy2); + t -= tex2D(maskSumTex, offset + sum_j + dx2, sum_i + dy1); + t += tex2D(maskSumTex, offset + sum_j + dx2, sum_i + dy2); d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1)); @@ -249,7 +257,8 @@ namespace cv { namespace gpu { namespace device }; template - __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter) + __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, + unsigned int* maxCounter, const uint maskOffset) { #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110 @@ -290,7 +299,7 @@ namespace cv { namespace gpu { namespace device const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave; const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave; - if (Mask::check(sum_i, sum_j, size)) + if (Mask::check(sum_i, sum_j, size, maskOffset)) { // Check to see if we have a max (in its 26 neighbours) const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff] @@ -342,7 +351,7 @@ namespace cv { namespace gpu { namespace device } void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter, - int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers) + int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers, const size_t maskOffset) { const int layer_rows = img_rows >> octave; const int layer_cols = img_cols >> octave; @@ -358,9 +367,9 @@ namespace cv { namespace gpu { namespace device const size_t smem_size = threads.x * threads.y * 3 * sizeof(float); if (use_mask) - icvFindMaximaInLayer<<>>(det, trace, maxPosBuffer, maxCounter); + icvFindMaximaInLayer<<>>(det, trace, maxPosBuffer, maxCounter, (uint)maskOffset); else - icvFindMaximaInLayer<<>>(det, trace, maxPosBuffer, maxCounter); + icvFindMaximaInLayer<<>>(det, trace, maxPosBuffer, maxCounter, 0); cudaSafeCall( cudaGetLastError() ); diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index eb1fff7e5..2a9650951 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -553,7 +553,7 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S src.locateROI(whole, offset); - if (info.supports(WARP_SHUFFLE_FUNCTIONS) ) + if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048) { GpuMat srcAlligned; diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp index 99c3370dd..a4a2acf5e 100644 --- a/modules/gpu/src/surf.cpp +++ b/modules/gpu/src/surf.cpp @@ -71,13 +71,14 @@ namespace cv { namespace gpu { namespace device void loadOctaveConstants(int octave, int layer_rows, int layer_cols); void bindImgTex(PtrStepSzb img); - void bindSumTex(PtrStepSz sum); - void bindMaskSumTex(PtrStepSz maskSum); + size_t bindSumTex(PtrStepSz sum); + size_t bindMaskSumTex(PtrStepSz maskSum); - void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers); + void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, + int octave, int nOctaveLayers, const size_t sumOffset); void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter, - int img_rows, int img_cols, int octave, bool use_mask, int nLayers); + int img_rows, int img_cols, int octave, bool use_mask, int nLayers, const size_t maskOffset); void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian, @@ -145,15 +146,17 @@ namespace loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast(surf_.hessianThreshold)); bindImgTex(img); - integralBuffered(img, surf_.sum, surf_.intBuffer); - bindSumTex(surf_.sum); + + sumOffset = bindSumTex(surf_.sum); + + return; if (use_mask) { min(mask, 1.0, surf_.mask1); integralBuffered(surf_.mask1, surf_.maskSum, surf_.intBuffer); - bindMaskSumTex(surf_.maskSum); + maskOffset = bindMaskSumTex(surf_.maskSum); } } @@ -173,10 +176,10 @@ namespace loadOctaveConstants(octave, layer_rows, layer_cols); - icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers); + icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers, sumOffset); icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr(), counters.ptr() + 1 + octave, - img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers); + img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers, maskOffset); unsigned int maxCounter; cudaSafeCall( cudaMemcpy(&maxCounter, counters.ptr() + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); @@ -234,6 +237,9 @@ namespace int maxCandidates; int maxFeatures; + size_t maskOffset; + size_t sumOffset; + GpuMat counters; }; }