Merge pull request #177 from cuda-geek/another-one-integral-fix
Commit 39da17a02a
@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
             return true;
         }

-        static __device__ __forceinline__ bool check(int, int, int, uint offset = 0)
+        static __device__ __forceinline__ bool check(int, int, int)
         {
             return true;
         }
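Note: WithOutMask is one half of a compile-time mask policy. The SURF kernels changed below are templated on a Mask type and call Mask::check for every sample; this always-true overload is inlined away entirely, so the unmasked path pays nothing for the test. A minimal self-contained sketch of the pattern (the kernel and the second policy here are illustrative, not from this codebase):

    #include <cstdio>
    #include <cuda_runtime.h>

    struct WithOutMask
    {
        // Always passes; after template instantiation the branch disappears.
        static __device__ __forceinline__ bool check(int, int, int) { return true; }
    };

    struct EveryOtherColumn // stand-in for WithMask, which samples a mask texture
    {
        static __device__ __forceinline__ bool check(int, int j, int) { return (j & 1) == 0; }
    };

    template <typename Mask>
    __global__ void countAccepted(int size, unsigned int* counter)
    {
        const int i = blockIdx.y * blockDim.y + threadIdx.y;
        const int j = blockIdx.x * blockDim.x + threadIdx.x;
        if (Mask::check(i, j, size))
            atomicAdd(counter, 1u);
    }

    int main()
    {
        unsigned int* counter;
        cudaMalloc(&counter, sizeof(unsigned int));
        cudaMemset(counter, 0, sizeof(unsigned int));

        countAccepted<WithOutMask><<<dim3(4, 4), dim3(16, 16)>>>(9, counter);

        unsigned int n = 0;
        cudaMemcpy(&n, counter, sizeof(unsigned int), cudaMemcpyDeviceToHost);
        printf("accepted %u of 4096 samples\n", n); // all 4096 with WithOutMask
        cudaFree(counter);
        return 0;
    }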
@@ -357,18 +357,19 @@ namespace cv { namespace gpu { namespace device
         #endif
         }

-        void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
+        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
         {
             {
                 // each thread handles 16 values, use 1 block/row
-                const int block = img.cols / 16;
+                // safe, because the step actually can't be less than 512 bytes
+                int block = integral.cols / 16;

                 // launch 1 block / row
                 const int grid = img.rows;

                 cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );

-                shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
+                shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
                 cudaSafeCall( cudaGetLastError() );
             }
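Note: shfl_integral_horizontal computes each row's prefix sum with warp shuffles, one block per row and 16 values per thread. Sizing the block from integral.cols (the destination buffer, padded to a multiple of 64 columns by the integralBuffered change further down, hence the 512-byte-step remark) rather than img.cols ensures the threads cover the whole padded row in complete 16-element chunks. For reference, the core warp-scan idiom such kernels build on looks like this (a generic sketch, not the kernel from this diff; current CUDA requires the _sync shuffle variants):

    #include <cstdio>
    #include <cuda_runtime.h>

    // Kogge-Stone inclusive scan across one warp: at each step a lane adds the
    // partial sum held by the lane `offset` positions to its left.
    __global__ void warpInclusiveScan(const unsigned int* in, unsigned int* out)
    {
        const unsigned int lane = threadIdx.x & 31;
        unsigned int x = in[threadIdx.x];

        #pragma unroll
        for (int offset = 1; offset < 32; offset *= 2)
        {
            const unsigned int y = __shfl_up_sync(0xffffffffu, x, offset);
            if (lane >= offset)
                x += y;
        }
        out[threadIdx.x] = x;
    }

    int main()
    {
        unsigned int h[32], *in, *out;
        for (int i = 0; i < 32; ++i) h[i] = 1; // all ones -> scan yields 1..32
        cudaMalloc(&in, sizeof(h)); cudaMalloc(&out, sizeof(h));
        cudaMemcpy(in, h, sizeof(h), cudaMemcpyHostToDevice);

        warpInclusiveScan<<<1, 32>>>(in, out);

        cudaMemcpy(h, out, sizeof(h), cudaMemcpyDeviceToHost);
        printf("last element: %u\n", h[31]); // 32
        cudaFree(in); cudaFree(out);
        return 0;
    }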
@@ -185,6 +185,7 @@ namespace cv { namespace gpu { namespace device

        void connectedConmonents(PtrStepSz<int4> candidates, int ncandidates, PtrStepSz<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
        {
+           if (!ncandidates) return;
            int block = ncandidates;
            int smem  = block * ( sizeof(int) + sizeof(int4) );
            disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
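Note: the added guard matters because block is taken directly from ncandidates. With zero candidates the launch would request zero threads per block, which CUDA rejects as an invalid configuration rather than treating as a no-op. A tiny repro of that failure mode (illustrative only):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void noop() {}

    int main()
    {
        // <<<1, 0>>> requests zero threads per block: the launch fails with
        // cudaErrorInvalidConfiguration instead of silently doing nothing,
        // exactly what the new `if (!ncandidates) return;` guard avoids.
        noop<<<1, 0>>>();
        printf("%s\n", cudaGetErrorString(cudaGetLastError()));
        return 0;
    }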
@@ -177,7 +177,7 @@ namespace cv { namespace gpu { namespace device
            return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
        }

-       __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace, uint sumOffset)
+       __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
        {
            // Determine the indices
            const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
@@ -198,9 +198,9 @@ namespace cv { namespace gpu { namespace device

            if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
            {
-               const float dx  = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), sumOffset + (j << c_octave));
-               const float dy  = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), sumOffset + (j << c_octave));
-               const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), sumOffset + (j << c_octave));
+               const float dx  = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave));
+               const float dy  = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave));
+               const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave));

                det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
                trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
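Note: icvCalcHaarPatternSum evaluates each Haar box in constant time from the integral image; dropping sumOffset only changes where the lookups start, not the arithmetic. The underlying four-corner identity, as a host-side sketch (hypothetical helper, not OpenCV's):

    #include <cstdio>

    // With S an integral image carrying an extra zero row and column
    // (S[y][x] = sum of all pixels above and to the left of (x, y)), any
    // axis-aligned box sum reduces to four lookups.
    unsigned int boxSum(const unsigned int* S, int stride, int x1, int y1, int x2, int y2)
    {
        return S[y2 * stride + x2] - S[y1 * stride + x2]
             - S[y2 * stride + x1] + S[y1 * stride + x1];
    }

    int main()
    {
        // 3x3 image of ones -> 4x4 integral image.
        const unsigned int S[16] = { 0, 0, 0, 0,
                                     0, 1, 2, 3,
                                     0, 2, 4, 6,
                                     0, 3, 6, 9 };
        printf("%u\n", boxSum(S, 4, 1, 1, 3, 3)); // 2x2 box of ones -> 4
        return 0;
    }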
@@ -208,7 +208,7 @@ namespace cv { namespace gpu { namespace device
        }

        void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
-           int octave, int nOctaveLayers, const size_t sumOffset)
+           int octave, int nOctaveLayers)
        {
            const int min_size = calcSize(octave, 0);
            const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
@@ -220,7 +220,7 @@ namespace cv { namespace gpu { namespace device
            grid.x = divUp(max_samples_j, threads.x);
            grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);

-           icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace, (uint)sumOffset);
+           icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
@@ -233,7 +233,7 @@ namespace cv { namespace gpu { namespace device

        struct WithMask
        {
-           static __device__ bool check(int sum_i, int sum_j, int size, const uint offset)
+           static __device__ bool check(int sum_i, int sum_j, int size)
            {
                float ratio = (float)size / 9.0f;

@@ -245,10 +245,10 @@ namespace cv { namespace gpu { namespace device
                int dy2 = __float2int_rn(ratio * c_DM[3]);

                float t = 0;
-               t += tex2D(maskSumTex, offset + sum_j + dx1, sum_i + dy1);
-               t -= tex2D(maskSumTex, offset + sum_j + dx1, sum_i + dy2);
-               t -= tex2D(maskSumTex, offset + sum_j + dx2, sum_i + dy1);
-               t += tex2D(maskSumTex, offset + sum_j + dx2, sum_i + dy2);
+               t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
+               t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
+               t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
+               t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);

                d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));

@@ -258,7 +258,7 @@ namespace cv { namespace gpu { namespace device

        template <typename Mask>
        __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
-           unsigned int* maxCounter, const uint maskOffset)
+           unsigned int* maxCounter)
        {
            #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110

@@ -299,7 +299,7 @@ namespace cv { namespace gpu { namespace device
                const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
                const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;

-               if (Mask::check(sum_i, sum_j, size, maskOffset))
+               if (Mask::check(sum_i, sum_j, size))
                {
                    // Check to see if we have a max (in its 26 neighbours)
                    const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
@@ -351,7 +351,7 @@ namespace cv { namespace gpu { namespace device
        }

        void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
-           int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers, const size_t maskOffset)
+           int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
        {
            const int layer_rows = img_rows >> octave;
            const int layer_cols = img_cols >> octave;
@@ -367,9 +367,9 @@ namespace cv { namespace gpu { namespace device
            const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);

            if (use_mask)
-               icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter, (uint)maskOffset);
+               icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
            else
-               icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter, 0);
+               icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);

            cudaSafeCall( cudaGetLastError() );

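Note: all of the sumOffset/maskOffset plumbing removed above existed because the legacy texture-reference API reports a byte offset when the bound pointer is not aligned to the device's texture alignment, and every tex2D x coordinate had to compensate. A sketch of what a binding helper of that era looks like (an assumption: the real bindSumTex may differ, and this API is deprecated and removed in CUDA 12):

    #include <cuda_runtime.h>

    texture<unsigned int, 2, cudaReadModeElementType> sumTex;

    // Returns the element offset that tex2D(sumTex, ...) lookups would have to
    // add. With the integral buffer now allocated with an aligned base pointer
    // and pitch (see the integralBuffered hunk below), the offset is always
    // zero, so the kernels can drop their offset parameters entirely.
    size_t bindSumTex(unsigned int* data, size_t pitchBytes, int cols, int rows)
    {
        size_t offsetBytes = 0;
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned int>();
        cudaBindTexture2D(&offsetBytes, sumTex, data, desc, cols, rows, pitchBytes);
        return offsetBytes / sizeof(unsigned int);
    }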
@@ -537,7 +537,7 @@ namespace cv { namespace gpu { namespace device
{
    namespace imgproc
    {
-       void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
+       void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
    }
}}}

@@ -553,44 +553,26 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S

    src.locateROI(whole, offset);

-   if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048)
+   if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
+       && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (src.step - offset.x))
    {
-       GpuMat srcAlligned;
+       ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);

-       if (src.cols % 16 == 0 && src.rows % 8 == 0 && offset.x % 16 == 0 && offset.y % 8 == 0)
-           srcAlligned = src;
-       else
-       {
-           ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 15) / 16) * 16, src.type(), buffer);
-
-           GpuMat inner = buffer(Rect(0, 0, src.cols, src.rows));
-
-           if (s)
-           {
-               s.enqueueMemSet(buffer, Scalar::all(0));
-               s.enqueueCopy(src, inner);
-           }
-           else
-           {
-               buffer.setTo(Scalar::all(0));
-               src.copyTo(inner);
-           }
-
-           srcAlligned = buffer;
-       }
-
-       sum.create(srcAlligned.rows + 1, srcAlligned.cols + 4, CV_32SC1);
+       cv::gpu::device::imgproc::shfl_integral_gpu(src, buffer, stream);

+       sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
        if (s)
            s.enqueueMemSet(sum, Scalar::all(0));
        else
            sum.setTo(Scalar::all(0));

-       GpuMat inner = sum(Rect(4, 1, srcAlligned.cols, srcAlligned.rows));
+       GpuMat inner = sum(Rect(1, 1, src.cols, src.rows));
+       GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));

-       cv::gpu::device::imgproc::shfl_integral_gpu(srcAlligned, inner, stream);
-
-       sum = sum(Rect(3, 0, src.cols + 1, src.rows + 1));
+       if (s)
+           s.enqueueCopy(res, inner);
+       else
+           res.copyTo(inner);
    }
    else
    {
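Note: the rewritten fast path drops the staging copy (srcAlligned) entirely. The new guard admits it only when the ROI starts on a 16-element boundary and the row rounded up to 64 columns still fits within the source step, so the kernel can read whole uint4 chunks in place; the buffer is padded to multiples of 64 columns and 8 rows to match. The result is then copied into sum(Rect(1, 1, ...)), giving the integral image its conventional zero top row and left column, instead of the old trick of creating sum four columns wider and re-slicing it. The round-up arithmetic, spelled out (trivial sketch):

    #include <cstdio>

    // Integer round-up to a multiple, as used in the new condition and the
    // buffer allocation above.
    static int roundUp(int x, int m) { return ((x + m - 1) / m) * m; }

    int main()
    {
        const int cols = 1000, rows = 35;
        printf("%d\n", roundUp(cols, 64)); // 1024 == ((1000 + 63) / 64) * 64
        printf("%d\n", roundUp(rows, 8));  // 40   == ((35 + 7) / 8) * 8
        return 0;
    }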
|
@ -75,10 +75,10 @@ namespace cv { namespace gpu { namespace device
|
|||||||
size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
|
size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
|
||||||
|
|
||||||
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
|
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
|
||||||
int octave, int nOctaveLayers, const size_t sumOffset);
|
int octave, int nOctaveLayer);
|
||||||
|
|
||||||
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
|
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
|
||||||
int img_rows, int img_cols, int octave, bool use_mask, int nLayers, const size_t maskOffset);
|
int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
|
||||||
|
|
||||||
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
|
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
|
||||||
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
|
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
|
||||||
@@ -146,8 +146,8 @@ namespace
            loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));

            bindImgTex(img);
-           integralBuffered(img, surf_.sum, surf_.intBuffer);

+           integralBuffered(img, surf_.sum, surf_.intBuffer);
            sumOffset = bindSumTex(surf_.sum);

            if (use_mask)
@@ -174,10 +174,10 @@ namespace

                loadOctaveConstants(octave, layer_rows, layer_cols);

-               icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers, sumOffset);
+               icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);

                icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
-                   img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers, maskOffset);
+                   img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers);

                unsigned int maxCounter;
                cudaSafeCall( cudaMemcpy(&maxCounter, counters.ptr<unsigned int>() + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );