diff --git a/modules/gpu/include/opencv2/gpu/device/utility.hpp b/modules/gpu/include/opencv2/gpu/device/utility.hpp index 4489a20b1..88a73a10e 100644 --- a/modules/gpu/include/opencv2/gpu/device/utility.hpp +++ b/modules/gpu/include/opencv2/gpu/device/utility.hpp @@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device return true; } - static __device__ __forceinline__ bool check(int, int, int, uint offset = 0) + static __device__ __forceinline__ bool check(int, int, int) { return true; } diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu index 558f9085d..09187fd25 100644 --- a/modules/gpu/src/cuda/integral_image.cu +++ b/modules/gpu/src/cuda/integral_image.cu @@ -357,18 +357,19 @@ namespace cv { namespace gpu { namespace device #endif } - void shfl_integral_gpu(PtrStepSzb img, PtrStepSz integral, cudaStream_t stream) + void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz integral, cudaStream_t stream) { { // each thread handles 16 values, use 1 block/row - const int block = img.cols / 16; + // save, becouse step is actually can't be less 512 bytes + int block = integral.cols / 16; // launch 1 block / row const int grid = img.rows; cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) ); - shfl_integral_horizontal<<>>((PtrStepSz) img, (PtrStepSz) integral); + shfl_integral_horizontal<<>>((const PtrStepSz) img, (PtrStepSz) integral); cudaSafeCall( cudaGetLastError() ); } diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu index 9b729fe63..55f5d7512 100644 --- a/modules/gpu/src/cuda/lbp.cu +++ b/modules/gpu/src/cuda/lbp.cu @@ -185,6 +185,7 @@ namespace cv { namespace gpu { namespace device void connectedConmonents(PtrStepSz candidates, int ncandidates, PtrStepSz objects, int groupThreshold, float grouping_eps, unsigned int* nclasses) { + if (!ncandidates) return; int block = ncandidates; int smem = block * ( sizeof(int) + sizeof(int4) ); disjoin<<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses); diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index 8c80559c5..37c4eb48a 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -177,7 +177,7 @@ namespace cv { namespace gpu { namespace device return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; } - __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace, uint sumOffset) + __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace) { // Determine the indices const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2); @@ -198,9 +198,9 @@ namespace cv { namespace gpu { namespace device if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j) { - const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), sumOffset + (j << c_octave)); - const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), sumOffset + (j << c_octave)); - const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), sumOffset + (j << c_octave)); + const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave)); + const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave)); + const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave)); det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy; trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy; @@ -208,7 +208,7 @@ namespace cv { namespace gpu { namespace device } void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, - int octave, int nOctaveLayers, const size_t sumOffset) + int octave, int nOctaveLayers) { const int min_size = calcSize(octave, 0); const int max_samples_i = 1 + ((img_rows - min_size) >> octave); @@ -220,7 +220,7 @@ namespace cv { namespace gpu { namespace device grid.x = divUp(max_samples_j, threads.x); grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2); - icvCalcLayerDetAndTrace<<>>(det, trace, (uint)sumOffset); + icvCalcLayerDetAndTrace<<>>(det, trace); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); @@ -233,7 +233,7 @@ namespace cv { namespace gpu { namespace device struct WithMask { - static __device__ bool check(int sum_i, int sum_j, int size, const uint offset) + static __device__ bool check(int sum_i, int sum_j, int size) { float ratio = (float)size / 9.0f; @@ -245,10 +245,10 @@ namespace cv { namespace gpu { namespace device int dy2 = __float2int_rn(ratio * c_DM[3]); float t = 0; - t += tex2D(maskSumTex, offset + sum_j + dx1, sum_i + dy1); - t -= tex2D(maskSumTex, offset + sum_j + dx1, sum_i + dy2); - t -= tex2D(maskSumTex, offset + sum_j + dx2, sum_i + dy1); - t += tex2D(maskSumTex, offset + sum_j + dx2, sum_i + dy2); + t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1); + t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2); + t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1); + t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2); d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1)); @@ -258,7 +258,7 @@ namespace cv { namespace gpu { namespace device template __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, - unsigned int* maxCounter, const uint maskOffset) + unsigned int* maxCounter) { #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110 @@ -299,7 +299,7 @@ namespace cv { namespace gpu { namespace device const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave; const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave; - if (Mask::check(sum_i, sum_j, size, maskOffset)) + if (Mask::check(sum_i, sum_j, size)) { // Check to see if we have a max (in its 26 neighbours) const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff] @@ -351,7 +351,7 @@ namespace cv { namespace gpu { namespace device } void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter, - int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers, const size_t maskOffset) + int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers) { const int layer_rows = img_rows >> octave; const int layer_cols = img_cols >> octave; @@ -367,9 +367,9 @@ namespace cv { namespace gpu { namespace device const size_t smem_size = threads.x * threads.y * 3 * sizeof(float); if (use_mask) - icvFindMaximaInLayer<<>>(det, trace, maxPosBuffer, maxCounter, (uint)maskOffset); + icvFindMaximaInLayer<<>>(det, trace, maxPosBuffer, maxCounter); else - icvFindMaximaInLayer<<>>(det, trace, maxPosBuffer, maxCounter, 0); + icvFindMaximaInLayer<<>>(det, trace, maxPosBuffer, maxCounter); cudaSafeCall( cudaGetLastError() ); diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index 0bf9c81c2..7f63737ef 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -537,7 +537,7 @@ namespace cv { namespace gpu { namespace device { namespace imgproc { - void shfl_integral_gpu(PtrStepSzb img, PtrStepSz integral, cudaStream_t stream); + void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz integral, cudaStream_t stream); } }}} @@ -553,44 +553,26 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S src.locateROI(whole, offset); - if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048) + if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048 + && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (src.step - offset.x)) { - GpuMat srcAlligned; + ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer); - if (src.cols % 16 == 0 && src.rows % 8 == 0 && offset.x % 16 == 0 && offset.y % 8 == 0) - srcAlligned = src; - else - { - ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 15) / 16) * 16, src.type(), buffer); - - GpuMat inner = buffer(Rect(0, 0, src.cols, src.rows)); - - if (s) - { - s.enqueueMemSet(buffer, Scalar::all(0)); - s.enqueueCopy(src, inner); - } - else - { - buffer.setTo(Scalar::all(0)); - src.copyTo(inner); - } - - srcAlligned = buffer; - } - - sum.create(srcAlligned.rows + 1, srcAlligned.cols + 4, CV_32SC1); + cv::gpu::device::imgproc::shfl_integral_gpu(src, buffer, stream); + sum.create(src.rows + 1, src.cols + 1, CV_32SC1); if (s) s.enqueueMemSet(sum, Scalar::all(0)); else sum.setTo(Scalar::all(0)); - GpuMat inner = sum(Rect(4, 1, srcAlligned.cols, srcAlligned.rows)); + GpuMat inner = sum(Rect(1, 1, src.cols, src.rows)); + GpuMat res = buffer(Rect(0, 0, src.cols, src.rows)); - cv::gpu::device::imgproc::shfl_integral_gpu(srcAlligned, inner, stream); - - sum = sum(Rect(3, 0, src.cols + 1, src.rows + 1)); + if (s) + s.enqueueCopy(res, inner); + else + res.copyTo(inner); } else { diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp index 72bb9c15e..5a1b07444 100644 --- a/modules/gpu/src/surf.cpp +++ b/modules/gpu/src/surf.cpp @@ -75,10 +75,10 @@ namespace cv { namespace gpu { namespace device size_t bindMaskSumTex(PtrStepSz maskSum); void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, - int octave, int nOctaveLayers, const size_t sumOffset); + int octave, int nOctaveLayer); void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter, - int img_rows, int img_cols, int octave, bool use_mask, int nLayers, const size_t maskOffset); + int img_rows, int img_cols, int octave, bool use_mask, int nLayers); void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian, @@ -146,8 +146,8 @@ namespace loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast(surf_.hessianThreshold)); bindImgTex(img); - integralBuffered(img, surf_.sum, surf_.intBuffer); + integralBuffered(img, surf_.sum, surf_.intBuffer); sumOffset = bindSumTex(surf_.sum); if (use_mask) @@ -174,10 +174,10 @@ namespace loadOctaveConstants(octave, layer_rows, layer_cols); - icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers, sumOffset); + icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers); icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr(), counters.ptr() + 1 + octave, - img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers, maskOffset); + img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers); unsigned int maxCounter; cudaSafeCall( cudaMemcpy(&maxCounter, counters.ptr() + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );