One more fix for Kepler-specific gpu::integral usage
This commit is contained in:
		@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
 | 
			
		||||
            return true;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        static __device__ __forceinline__ bool check(int, int, int)
 | 
			
		||||
        static __device__ __forceinline__ bool check(int, int, int, uint offset = 0)
 | 
			
		||||
        {
 | 
			
		||||
            return true;
 | 
			
		||||
        }
 | 
			
		||||
 
 | 
			
		||||
@@ -108,13 +108,20 @@ namespace cv { namespace gpu { namespace device
 | 
			
		||||
        {
 | 
			
		||||
            bindTexture(&imgTex, img);
 | 
			
		||||
        }
 | 
			
		||||
        void bindSumTex(PtrStepSz<uint> sum)
 | 
			
		||||
 | 
			
		||||
        size_t bindSumTex(PtrStepSz<uint> sum)
 | 
			
		||||
        {
 | 
			
		||||
            bindTexture(&sumTex, sum);
 | 
			
		||||
            size_t offset;
 | 
			
		||||
            cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
 | 
			
		||||
            cudaSafeCall( cudaBindTexture2D(&offset, sumTex, sum.data, desc_sum, sum.cols, sum.rows, sum.step));
 | 
			
		||||
            return offset / sizeof(uint);
 | 
			
		||||
        }
 | 
			
		||||
        void bindMaskSumTex(PtrStepSz<uint> maskSum)
 | 
			
		||||
        size_t bindMaskSumTex(PtrStepSz<uint> maskSum)
 | 
			
		||||
        {
 | 
			
		||||
            bindTexture(&maskSumTex, maskSum);
 | 
			
		||||
            size_t offset;
 | 
			
		||||
            cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
 | 
			
		||||
            cudaSafeCall( cudaBindTexture2D(&offset, maskSumTex, maskSum.data, desc_sum, maskSum.cols, maskSum.rows, maskSum.step));
 | 
			
		||||
            return offset / sizeof(uint);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)
 | 
			
		||||
@@ -170,7 +177,7 @@ namespace cv { namespace gpu { namespace device
 | 
			
		||||
            return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
 | 
			
		||||
        __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace, uint sumOffset)
 | 
			
		||||
        {
 | 
			
		||||
            // Determine the indices
 | 
			
		||||
            const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
 | 
			
		||||
@@ -191,16 +198,17 @@ namespace cv { namespace gpu { namespace device
 | 
			
		||||
 | 
			
		||||
            if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
 | 
			
		||||
            {
 | 
			
		||||
                const float dx  = icvCalcHaarPatternSum<3>(c_DX , 9, size, i << c_octave, j << c_octave);
 | 
			
		||||
                const float dy  = icvCalcHaarPatternSum<3>(c_DY , 9, size, i << c_octave, j << c_octave);
 | 
			
		||||
                const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, i << c_octave, j << c_octave);
 | 
			
		||||
                const float dx  = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), sumOffset + (j << c_octave));
 | 
			
		||||
                const float dy  = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), sumOffset + (j << c_octave));
 | 
			
		||||
                const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), sumOffset + (j << c_octave));
 | 
			
		||||
 | 
			
		||||
                det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
 | 
			
		||||
                trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers)
 | 
			
		||||
        void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
 | 
			
		||||
            int octave, int nOctaveLayers, const size_t sumOffset)
 | 
			
		||||
        {
 | 
			
		||||
            const int min_size = calcSize(octave, 0);
 | 
			
		||||
            const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
 | 
			
		||||
@@ -212,7 +220,7 @@ namespace cv { namespace gpu { namespace device
 | 
			
		||||
            grid.x = divUp(max_samples_j, threads.x);
 | 
			
		||||
            grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
 | 
			
		||||
 | 
			
		||||
            icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
 | 
			
		||||
            icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace, (uint)sumOffset);
 | 
			
		||||
            cudaSafeCall( cudaGetLastError() );
 | 
			
		||||
 | 
			
		||||
            cudaSafeCall( cudaDeviceSynchronize() );
 | 
			
		||||
@@ -225,7 +233,7 @@ namespace cv { namespace gpu { namespace device
 | 
			
		||||
 | 
			
		||||
        struct WithMask
 | 
			
		||||
        {
 | 
			
		||||
            static __device__ bool check(int sum_i, int sum_j, int size)
 | 
			
		||||
            static __device__ bool check(int sum_i, int sum_j, int size, const uint offset)
 | 
			
		||||
            {
 | 
			
		||||
                float ratio = (float)size / 9.0f;
 | 
			
		||||
 | 
			
		||||
@@ -237,10 +245,10 @@ namespace cv { namespace gpu { namespace device
 | 
			
		||||
                int dy2 = __float2int_rn(ratio * c_DM[3]);
 | 
			
		||||
 | 
			
		||||
                float t = 0;
 | 
			
		||||
                t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
 | 
			
		||||
                t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
 | 
			
		||||
                t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
 | 
			
		||||
                t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);
 | 
			
		||||
                t += tex2D(maskSumTex, offset + sum_j + dx1, sum_i + dy1);
 | 
			
		||||
                t -= tex2D(maskSumTex, offset + sum_j + dx1, sum_i + dy2);
 | 
			
		||||
                t -= tex2D(maskSumTex, offset + sum_j + dx2, sum_i + dy1);
 | 
			
		||||
                t += tex2D(maskSumTex, offset + sum_j + dx2, sum_i + dy2);
 | 
			
		||||
 | 
			
		||||
                d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
 | 
			
		||||
 | 
			
		||||
@@ -249,7 +257,8 @@ namespace cv { namespace gpu { namespace device
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        template <typename Mask>
 | 
			
		||||
        __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)
 | 
			
		||||
        __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
 | 
			
		||||
            unsigned int* maxCounter, const uint maskOffset)
 | 
			
		||||
        {
 | 
			
		||||
            #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
 | 
			
		||||
 | 
			
		||||
@@ -290,7 +299,7 @@ namespace cv { namespace gpu { namespace device
 | 
			
		||||
                    const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
 | 
			
		||||
                    const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
 | 
			
		||||
 | 
			
		||||
                    if (Mask::check(sum_i, sum_j, size))
 | 
			
		||||
                    if (Mask::check(sum_i, sum_j, size, maskOffset))
 | 
			
		||||
                    {
 | 
			
		||||
                        // Check to see if we have a max (in its 26 neighbours)
 | 
			
		||||
                        const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
 | 
			
		||||
@@ -342,7 +351,7 @@ namespace cv { namespace gpu { namespace device
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
 | 
			
		||||
            int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
 | 
			
		||||
            int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers, const size_t maskOffset)
 | 
			
		||||
        {
 | 
			
		||||
            const int layer_rows = img_rows >> octave;
 | 
			
		||||
            const int layer_cols = img_cols >> octave;
 | 
			
		||||
@@ -358,9 +367,9 @@ namespace cv { namespace gpu { namespace device
 | 
			
		||||
            const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
 | 
			
		||||
 | 
			
		||||
            if (use_mask)
 | 
			
		||||
                icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
 | 
			
		||||
                icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter, (uint)maskOffset);
 | 
			
		||||
            else
 | 
			
		||||
                icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
 | 
			
		||||
                icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter, 0);
 | 
			
		||||
 | 
			
		||||
            cudaSafeCall( cudaGetLastError() );
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -553,7 +553,7 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
 | 
			
		||||
 | 
			
		||||
    src.locateROI(whole, offset);
 | 
			
		||||
 | 
			
		||||
    if (info.supports(WARP_SHUFFLE_FUNCTIONS) )
 | 
			
		||||
    if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048)
 | 
			
		||||
    {
 | 
			
		||||
        GpuMat srcAlligned;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -71,13 +71,14 @@ namespace cv { namespace gpu { namespace device
 | 
			
		||||
        void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
 | 
			
		||||
 | 
			
		||||
        void bindImgTex(PtrStepSzb img);
 | 
			
		||||
        void bindSumTex(PtrStepSz<unsigned int> sum);
 | 
			
		||||
        void bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
 | 
			
		||||
        size_t bindSumTex(PtrStepSz<unsigned int> sum);
 | 
			
		||||
        size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
 | 
			
		||||
 | 
			
		||||
        void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers);
 | 
			
		||||
        void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
 | 
			
		||||
            int octave, int nOctaveLayers, const size_t sumOffset);
 | 
			
		||||
 | 
			
		||||
        void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
 | 
			
		||||
            int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
 | 
			
		||||
            int img_rows, int img_cols, int octave, bool use_mask, int nLayers, const size_t maskOffset);
 | 
			
		||||
 | 
			
		||||
        void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
 | 
			
		||||
            float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
 | 
			
		||||
@@ -145,15 +146,17 @@ namespace
 | 
			
		||||
            loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
 | 
			
		||||
 | 
			
		||||
            bindImgTex(img);
 | 
			
		||||
 | 
			
		||||
            integralBuffered(img, surf_.sum, surf_.intBuffer);
 | 
			
		||||
            bindSumTex(surf_.sum);
 | 
			
		||||
 | 
			
		||||
            sumOffset = bindSumTex(surf_.sum);
 | 
			
		||||
 | 
			
		||||
            return;
 | 
			
		||||
 | 
			
		||||
            if (use_mask)
 | 
			
		||||
            {
 | 
			
		||||
                min(mask, 1.0, surf_.mask1);
 | 
			
		||||
                integralBuffered(surf_.mask1, surf_.maskSum, surf_.intBuffer);
 | 
			
		||||
                bindMaskSumTex(surf_.maskSum);
 | 
			
		||||
                maskOffset = bindMaskSumTex(surf_.maskSum);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
@@ -173,10 +176,10 @@ namespace
 | 
			
		||||
 | 
			
		||||
                loadOctaveConstants(octave, layer_rows, layer_cols);
 | 
			
		||||
 | 
			
		||||
                icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);
 | 
			
		||||
                icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers, sumOffset);
 | 
			
		||||
 | 
			
		||||
                icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
 | 
			
		||||
                    img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers);
 | 
			
		||||
                    img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers, maskOffset);
 | 
			
		||||
 | 
			
		||||
                unsigned int maxCounter;
 | 
			
		||||
                cudaSafeCall( cudaMemcpy(&maxCounter, counters.ptr<unsigned int>() + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
 | 
			
		||||
@@ -234,6 +237,9 @@ namespace
 | 
			
		||||
        int maxCandidates;
 | 
			
		||||
        int maxFeatures;
 | 
			
		||||
 | 
			
		||||
        size_t maskOffset;
 | 
			
		||||
        size_t sumOffset;
 | 
			
		||||
 | 
			
		||||
        GpuMat counters;
 | 
			
		||||
    };
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user