opencv/modules/gpu/src/cuda/fast.cu

386 lines
53 KiB
Plaintext
Raw Normal View History

2011-12-27 10:33:20 +01:00
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
//
// The original code was written by Paul Furgale and Chi Hay Tong
// and later optimized and prepared for integration into OpenCV by Itseez.
//
//M*/
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
namespace cv { namespace gpu { namespace device
{
namespace fast
{
__device__ unsigned int g_counter = 0;
///////////////////////////////////////////////////////////////////////////
// calcKeypoints
__constant__ uchar c_table[] = { 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xc0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xf0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xc0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xc0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xf0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0
// 1 -> v > x + th
// 2 -> v < x - th
// 0 -> x - th <= v <= x + th
__device__ __forceinline__ int diffType(const int v, const int x, const int th)
{
const int diff = x - v;
return static_cast<int>(diff < -th) + (static_cast<int>(diff > th) << 1);
}
__device__ void calcMask(const uint C[4], const int v, const int th, int& mask1, int& mask2)
{
mask1 = 0;
mask2 = 0;
int d1, d2;
d1 = diffType(v, C[0] & 0xff, th);
d2 = diffType(v, C[2] & 0xff, th);
if ((d1 | d2) == 0)
return;
mask1 |= (d1 & 1) << 0;
mask2 |= ((d1 & 2) >> 1) << 0;
mask1 |= (d2 & 1) << 8;
mask2 |= ((d2 & 2) >> 1) << 8;
d1 = diffType(v, C[1] & 0xff, th);
d2 = diffType(v, C[3] & 0xff, th);
if ((d1 | d2) == 0)
return;
mask1 |= (d1 & 1) << 4;
mask2 |= ((d1 & 2) >> 1) << 4;
mask1 |= (d2 & 1) << 12;
mask2 |= ((d2 & 2) >> 1) << 12;
d1 = diffType(v, (C[0] >> (2 * 8)) & 0xff, th);
d2 = diffType(v, (C[2] >> (2 * 8)) & 0xff, th);
if ((d1 | d2) == 0)
return;
mask1 |= (d1 & 1) << 2;
mask2 |= ((d1 & 2) >> 1) << 2;
mask1 |= (d2 & 1) << 10;
mask2 |= ((d2 & 2) >> 1) << 10;
d1 = diffType(v, (C[1] >> (2 * 8)) & 0xff, th);
d2 = diffType(v, (C[3] >> (2 * 8)) & 0xff, th);
if ((d1 | d2) == 0)
return;
mask1 |= (d1 & 1) << 6;
mask2 |= ((d1 & 2) >> 1) << 6;
mask1 |= (d2 & 1) << 14;
mask2 |= ((d2 & 2) >> 1) << 14;
d1 = diffType(v, (C[0] >> (1 * 8)) & 0xff, th);
d2 = diffType(v, (C[2] >> (1 * 8)) & 0xff, th);
/*if ((d1 | d2) == 0)
return;*/
mask1 |= (d1 & 1) << 1;
mask2 |= ((d1 & 2) >> 1) << 1;
mask1 |= (d2 & 1) << 9;
mask2 |= ((d2 & 2) >> 1) << 9;
d1 = diffType(v, (C[0] >> (3 * 8)) & 0xff, th);
d2 = diffType(v, (C[2] >> (3 * 8)) & 0xff, th);
/*if ((d1 | d2) == 0)
return;*/
mask1 |= (d1 & 1) << 3;
mask2 |= ((d1 & 2) >> 1) << 3;
mask1 |= (d2 & 1) << 11;
mask2 |= ((d2 & 2) >> 1) << 11;
d1 = diffType(v, (C[1] >> (1 * 8)) & 0xff, th);
d2 = diffType(v, (C[3] >> (1 * 8)) & 0xff, th);
/*if ((d1 | d2) == 0)
return;*/
mask1 |= (d1 & 1) << 5;
mask2 |= ((d1 & 2) >> 1) << 5;
mask1 |= (d2 & 1) << 13;
mask2 |= ((d2 & 2) >> 1) << 13;
d1 = diffType(v, (C[1] >> (3 * 8)) & 0xff, th);
d2 = diffType(v, (C[3] >> (3 * 8)) & 0xff, th);
mask1 |= (d1 & 1) << 7;
mask2 |= ((d1 & 2) >> 1) << 7;
mask1 |= (d2 & 1) << 15;
mask2 |= ((d2 & 2) >> 1) << 15;
}
// 1 -> v > x + th
// 2 -> v < x - th
// 0 -> not a keypoint
__device__ __forceinline__ bool isKeyPoint(int mask1, int mask2)
{
return (__popc(mask1) > 8 && (c_table[(mask1 >> 3) - 63] & (1 << (mask1 & 7)))) ||
(__popc(mask2) > 8 && (c_table[(mask2 >> 3) - 63] & (1 << (mask2 & 7))));
}
__device__ int cornerScore(const uint C[4], const int v, const int threshold)
{
// binary search in [threshold + 1, 255]
int min = threshold + 1;
int max = 255;
while (min <= max)
{
const int mid = (min + max) >> 1;
int mask1 = 0;
int mask2 = 0;
calcMask(C, v, mid, mask1, mask2);
int isKp = static_cast<int>(isKeyPoint(mask1, mask2));
min = isKp * (mid + 1) + (isKp ^ 1) * min;
max = (isKp ^ 1) * (mid - 1) + isKp * max;
}
return min - 1;
}
template <bool calcScore, class Mask>
__global__ void calcKeypoints(const DevMem2Db img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold)
{
2011-12-28 08:56:19 +01:00
#if __CUDA_ARCH__ >= 110
2011-12-27 10:33:20 +01:00
const int j = threadIdx.x + blockIdx.x * blockDim.x + 3;
const int i = threadIdx.y + blockIdx.y * blockDim.y + 3;
if (i < img.rows - 3 && j < img.cols - 3 && mask(i, j))
{
int v;
uint C[4] = {0,0,0,0};
C[2] |= static_cast<uint>(img(i - 3, j - 1)) << 8;
C[2] |= static_cast<uint>(img(i - 3, j));
C[1] |= static_cast<uint>(img(i - 3, j + 1)) << (3 * 8);
C[2] |= static_cast<uint>(img(i - 2, j - 2)) << (2 * 8);
C[1] |= static_cast<uint>(img(i - 2, j + 2)) << (2 * 8);
C[2] |= static_cast<uint>(img(i - 1, j - 3)) << (3 * 8);
C[1] |= static_cast<uint>(img(i - 1, j + 3)) << 8;
C[3] |= static_cast<uint>(img(i, j - 3));
v = static_cast<int>(img(i, j));
C[1] |= static_cast<uint>(img(i, j + 3));
int d1 = diffType(v, C[1] & 0xff, threshold);
int d2 = diffType(v, C[3] & 0xff, threshold);
if ((d1 | d2) == 0)
return;
C[3] |= static_cast<uint>(img(i + 1, j - 3)) << 8;
C[0] |= static_cast<uint>(img(i + 1, j + 3)) << (3 * 8);
C[3] |= static_cast<uint>(img(i + 2, j - 2)) << (2 * 8);
C[0] |= static_cast<uint>(img(i + 2, j + 2)) << (2 * 8);
C[3] |= static_cast<uint>(img(i + 3, j - 1)) << (3 * 8);
C[0] |= static_cast<uint>(img(i + 3, j));
C[0] |= static_cast<uint>(img(i + 3, j + 1)) << 8;
int mask1 = 0;
int mask2 = 0;
calcMask(C, v, threshold, mask1, mask2);
if (isKeyPoint(mask1, mask2))
{
if (calcScore) score(i, j) = cornerScore(C, v, threshold);
const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1));
if (ind < maxKeypoints)
kpLoc[ind] = make_short2(j, i);
}
}
2011-12-28 08:56:19 +01:00
#endif
2011-12-27 10:33:20 +01:00
}
int calcKeypoints_gpu(DevMem2Db img, DevMem2Db mask, short2* kpLoc, int maxKeypoints, DevMem2Di score, int threshold)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(img.cols - 6, block.x);
grid.y = divUp(img.rows - 6, block.y);
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
if (score.data)
{
if (mask.data)
calcKeypoints<true><<<grid, block>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
else
calcKeypoints<true><<<grid, block>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
}
else
{
if (mask.data)
calcKeypoints<false><<<grid, block>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
else
calcKeypoints<false><<<grid, block>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
unsigned int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
return count;
}
///////////////////////////////////////////////////////////////////////////
// nonmaxSupression
__global__ void nonmaxSupression(const short2* kpLoc, int count, const DevMem2Di scoreMat, short2* locFinal, float* responseFinal)
{
2011-12-28 08:56:19 +01:00
#if __CUDA_ARCH__ >= 110
2011-12-27 10:33:20 +01:00
const int kpIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (kpIdx < count)
{
short2 loc = kpLoc[kpIdx];
int score = scoreMat(loc.y, loc.x);
bool ismax =
score > scoreMat(loc.y - 1, loc.x - 1) &&
score > scoreMat(loc.y - 1, loc.x ) &&
score > scoreMat(loc.y - 1, loc.x + 1) &&
score > scoreMat(loc.y , loc.x - 1) &&
score > scoreMat(loc.y , loc.x + 1) &&
score > scoreMat(loc.y + 1, loc.x - 1) &&
score > scoreMat(loc.y + 1, loc.x ) &&
score > scoreMat(loc.y + 1, loc.x + 1);
if (ismax)
{
const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1));
locFinal[ind] = loc;
responseFinal[ind] = static_cast<float>(score);
}
}
2011-12-28 08:56:19 +01:00
#endif
2011-12-27 10:33:20 +01:00
}
int nonmaxSupression_gpu(const short2* kpLoc, int count, DevMem2Di score, short2* loc, float* response)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
dim3 block(256);
dim3 grid;
grid.x = divUp(count, block.x);
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
nonmaxSupression<<<grid, block>>>(kpLoc, count, score, loc, response);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
unsigned int new_count;
cudaSafeCall( cudaMemcpy(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
return new_count;
}
} // namespace fast
}}}