reintegrate warp shuffle based integral
This commit is contained in:
@@ -361,14 +361,8 @@ namespace cv { namespace gpu { namespace device
|
|||||||
{
|
{
|
||||||
{
|
{
|
||||||
// each thread handles 16 values, use 1 block/row
|
// each thread handles 16 values, use 1 block/row
|
||||||
int block = img.cols / 16;
|
|
||||||
|
|
||||||
// safe, because step actually can't be less than 512 bytes
|
// safe, because step actually can't be less than 512 bytes
|
||||||
int align = img.cols % 4;
|
int block = integral.cols / 16;
|
||||||
if ( align != 0)
|
|
||||||
{
|
|
||||||
block += (4 - align);
|
|
||||||
}
|
|
||||||
|
|
||||||
// launch 1 block / row
|
// launch 1 block / row
|
||||||
const int grid = img.rows;
|
const int grid = img.rows;
|
||||||
|
@@ -553,44 +553,25 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
|
|||||||
|
|
||||||
src.locateROI(whole, offset);
|
src.locateROI(whole, offset);
|
||||||
|
|
||||||
if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048)
|
if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048 && offset.x % 16 == 0 && (src.cols + 63) / 64 <= (src.step - offset.x))
|
||||||
{
|
{
|
||||||
GpuMat srcAlligned;
|
ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
|
||||||
|
|
||||||
if (src.cols % 16 == 0 && src.rows % 8 == 0 && offset.x % 16 == 0 && offset.y % 8 == 0)
|
cv::gpu::device::imgproc::shfl_integral_gpu(src, buffer, stream);
|
||||||
srcAlligned = src;
|
|
||||||
else
|
|
||||||
{
|
|
||||||
ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 15) / 16) * 16, src.type(), buffer);
|
|
||||||
|
|
||||||
GpuMat inner = buffer(Rect(0, 0, src.cols, src.rows));
|
|
||||||
|
|
||||||
if (s)
|
|
||||||
{
|
|
||||||
s.enqueueMemSet(buffer, Scalar::all(0));
|
|
||||||
s.enqueueCopy(src, inner);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
buffer.setTo(Scalar::all(0));
|
|
||||||
src.copyTo(inner);
|
|
||||||
}
|
|
||||||
|
|
||||||
srcAlligned = buffer;
|
|
||||||
}
|
|
||||||
|
|
||||||
sum.create(srcAlligned.rows + 1, srcAlligned.cols + 4, CV_32SC1);
|
|
||||||
|
|
||||||
|
sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
|
||||||
if (s)
|
if (s)
|
||||||
s.enqueueMemSet(sum, Scalar::all(0));
|
s.enqueueMemSet(sum, Scalar::all(0));
|
||||||
else
|
else
|
||||||
sum.setTo(Scalar::all(0));
|
sum.setTo(Scalar::all(0));
|
||||||
|
|
||||||
GpuMat inner = sum(Rect(4, 1, srcAlligned.cols, srcAlligned.rows));
|
GpuMat inner = sum(Rect(1, 1, src.cols, src.rows));
|
||||||
|
GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
|
||||||
|
|
||||||
cv::gpu::device::imgproc::shfl_integral_gpu(srcAlligned, inner, stream);
|
if (s)
|
||||||
|
s.enqueueCopy(res, inner);
|
||||||
sum = sum(Rect(3, 0, src.cols + 1, src.rows + 1));
|
else
|
||||||
|
res.copyTo(inner);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
Reference in New Issue
Block a user