gpu::HoughLines : minor code improvements

This commit is contained in:
Vladislav Vinogradov 2012-08-20 16:03:01 +04:00
parent 1e4012079d
commit c26d543e1e
5 changed files with 129 additions and 117 deletions

View File

@ -820,6 +820,7 @@ private:
int nLayers_;
};
//! HoughLines
CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
CV_EXPORTS void HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta);

View File

@ -1626,7 +1626,7 @@ PERF_TEST_P(Sz_DoSort, ImgProc_HoughLines, Combine(GPU_TYPICAL_MAT_SIZES, Bool()
cv::Mat src(size, CV_8UC1, cv::Scalar::all(0));
const int numLines = rng.uniform(500, 2000);
const int numLines = rng.uniform(100, 300);
for (int i = 0; i < numLines; ++i)
{
cv::Point p1(rng.uniform(0, src.cols), rng.uniform(0, src.rows));

View File

@ -59,7 +59,7 @@ namespace cv { namespace gpu { namespace device
{
__shared__ int s_queues[4][32 * PIXELS_PER_THREAD];
__shared__ int s_qsize[4];
__shared__ int s_start[4];
__shared__ int s_globStart[4];
const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -73,9 +73,10 @@ namespace cv { namespace gpu { namespace device
__syncthreads();
// fill the queue
const uchar* srcRow = src.ptr(y);
for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
{
if (src(y, xx))
if (srcRow[xx])
{
const unsigned int val = (y << 16) | xx;
const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
@ -89,36 +90,34 @@ namespace cv { namespace gpu { namespace device
if (threadIdx.x == 0 && threadIdx.y == 0)
{
// find how many items are stored in each list
int total_size = 0;
int totalSize = 0;
for (int i = 0; i < blockDim.y; ++i)
{
s_start[i] = total_size;
total_size += s_qsize[i];
s_globStart[i] = totalSize;
totalSize += s_qsize[i];
}
// calculate the offset in the global list
const int global_offset = atomicAdd(&g_counter, total_size);
const int globalOffset = atomicAdd(&g_counter, totalSize);
for (int i = 0; i < blockDim.y; ++i)
s_start[i] += global_offset;
s_globStart[i] += globalOffset;
}
__syncthreads();
// copy local queues to global queue
const int qsize = s_qsize[threadIdx.y];
for(int i = threadIdx.x; i < qsize; i += blockDim.x)
{
const unsigned int val = s_queues[threadIdx.y][i];
list[s_start[threadIdx.y] + i] = val;
}
int gidx = s_globStart[threadIdx.y] + threadIdx.x;
for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
list[gidx] = s_queues[threadIdx.y][i];
}
int buildPointList_gpu(DevMem2Db src, unsigned int* list)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
const dim3 block(32, 4);
const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
@ -130,10 +129,10 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
int total_count;
cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
return total_count;
return totalCount;
}
////////////////////////////////////////////////////////////////////////
@ -144,24 +143,26 @@ namespace cv { namespace gpu { namespace device
const int n = blockIdx.x;
const float ang = n * theta;
float sin_ang;
float cos_ang;
sincosf(ang, &sin_ang, &cos_ang);
float sinVal;
float cosVal;
sincosf(ang, &sinVal, &cosVal);
sinVal *= irho;
cosVal *= irho;
const float tabSin = sin_ang * irho;
const float tabCos = cos_ang * irho;
const int shift = (numrho - 1) / 2;
int* accumRow = accum.ptr(n + 1);
for (int i = threadIdx.x; i < count; i += blockDim.x)
{
const unsigned int qvalue = list[i];
const unsigned int val = list[i];
const int x = (qvalue & 0x0000FFFF);
const int y = (qvalue >> 16) & 0x0000FFFF;
const int x = (val & 0xFFFF);
const int y = (val >> 16) & 0xFFFF;
int r = __float2int_rn(x * tabCos + y * tabSin);
r += (numrho - 1) / 2;
int r = __float2int_rn(x * cosVal + y * sinVal);
r += shift;
::atomicAdd(accum.ptr(n + 1) + r + 1, 1);
::atomicAdd(accumRow + r + 1, 1);
}
}
@ -177,30 +178,32 @@ namespace cv { namespace gpu { namespace device
const int n = blockIdx.x;
const float ang = n * theta;
float sin_ang;
float cos_ang;
sincosf(ang, &sin_ang, &cos_ang);
float sinVal;
float cosVal;
sincosf(ang, &sinVal, &cosVal);
sinVal *= irho;
cosVal *= irho;
const float tabSin = sin_ang * irho;
const float tabCos = cos_ang * irho;
const int shift = (numrho - 1) / 2;
for (int i = threadIdx.x; i < count; i += blockDim.x)
{
const unsigned int qvalue = list[i];
const unsigned int val = list[i];
const int x = (qvalue & 0x0000FFFF);
const int y = (qvalue >> 16) & 0x0000FFFF;
const int x = (val & 0xFFFF);
const int y = (val >> 16) & 0xFFFF;
int r = __float2int_rn(x * tabCos + y * tabSin);
r += (numrho - 1) / 2;
int r = __float2int_rn(x * cosVal + y * sinVal);
r += shift;
Emulation::smem::atomicAdd(&smem[r + 1], 1);
}
__syncthreads();
for (int i = threadIdx.x; i < numrho; i += blockDim.x)
accum(n + 1, i) = smem[i];
int* accumRow = accum.ptr(n + 1);
for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
accumRow[i] = smem[i];
}
void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
@ -225,21 +228,21 @@ namespace cv { namespace gpu { namespace device
////////////////////////////////////////////////////////////////////////
// linesGetResult
__global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float threshold, const float theta, const float rho, const int numrho)
__global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const float threshold, const int numrho)
{
__shared__ int smem[8][32];
int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y;
const int x = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
const int y = blockIdx.y * (blockDim.y - 2) + threadIdx.y;
if (r >= accum.cols || n >= accum.rows)
if (x >= accum.cols || y >= accum.rows)
return;
smem[threadIdx.y][threadIdx.x] = accum(n, r);
smem[threadIdx.y][threadIdx.x] = accum(y, x);
__syncthreads();
r -= 1;
n -= 1;
const int r = x - 1;
const int n = y - 1;
if (threadIdx.x == 0 || threadIdx.x == blockDim.x - 1 || threadIdx.y == 0 || threadIdx.y == blockDim.y - 1 || r >= accum.cols - 2 || n >= accum.rows - 2)
return;
@ -264,32 +267,32 @@ namespace cv { namespace gpu { namespace device
int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols, block.x - 2), divUp(accum.rows, block.y - 2));
linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, threshold, theta, rho, accum.cols - 2);
linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int total_count;
cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
total_count = ::min(total_count, maxSize);
totalCount = ::min(totalCount, maxSize);
if (doSort && total_count > 0)
if (doSort && totalCount > 0)
{
thrust::device_ptr<float2> out_ptr(out);
thrust::device_ptr<int> votes_ptr(votes);
thrust::sort_by_key(votes_ptr, votes_ptr + total_count, out_ptr, thrust::greater<int>());
thrust::device_ptr<float2> outPtr(out);
thrust::device_ptr<int> votesPtr(votes);
thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
}
return total_count;
return totalCount;
}
}
}}}

View File

@ -57,11 +57,27 @@ namespace cv { namespace gpu { namespace device
namespace hough
{
int buildPointList_gpu(DevMem2Db src, unsigned int* list);
void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort);
}
}}}
//////////////////////////////////////////////////////////
// HoughLines
void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
{
GpuMat accum, buf;
HoughLines(src, lines, accum, buf, rho, theta, threshold, doSort, maxLines);
}
void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort, int maxLines)
{
HoughLinesTransform(src, accum, buf, rho, theta);
HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines);
}
void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta)
{
using namespace cv::gpu::device::hough;
@ -80,23 +96,23 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf,
CV_Assert(numangle > 0 && numrho > 0);
ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum);
accum.setTo(cv::Scalar::all(0));
accum.setTo(Scalar::all(0));
cv::gpu::DeviceInfo devInfo;
DeviceInfo devInfo;
if (count > 0)
linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(cv::gpu::FEATURE_SET_COMPUTE_20));
linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
}
void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
{
using namespace cv::gpu::device;
using namespace cv::gpu::device::hough;
CV_Assert(accum.type() == CV_32SC1);
ensureSizeIsEnough(2, maxLines, CV_32FC2, lines);
int count = hough::linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
int count = linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
if (count > 0)
lines.cols = count;
@ -104,18 +120,6 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float
lines.release();
}
void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
{
cv::gpu::GpuMat accum, buf;
HoughLines(src, lines, accum, buf, rho, theta, threshold, doSort, maxLines);
}
void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort, int maxLines)
{
HoughLinesTransform(src, accum, buf, rho, theta);
HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines);
}
void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, OutputArray h_votes_)
{
if (d_lines.empty())
@ -129,14 +133,14 @@ void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, Ou
CV_Assert(d_lines.rows == 2 && d_lines.type() == CV_32FC2);
h_lines_.create(1, d_lines.cols, CV_32FC2);
cv::Mat h_lines = h_lines_.getMat();
Mat h_lines = h_lines_.getMat();
d_lines.row(0).download(h_lines);
if (h_votes_.needed())
{
h_votes_.create(1, d_lines.cols, CV_32SC1);
cv::Mat h_votes = h_votes_.getMat();
cv::gpu::GpuMat d_votes(1, d_lines.cols, CV_32SC1, const_cast<int*>(d_lines.ptr<int>(1)));
Mat h_votes = h_votes_.getMat();
GpuMat d_votes(1, d_lines.cols, CV_32SC1, const_cast<int*>(d_lines.ptr<int>(1)));
d_votes.download(h_votes);
}
}

View File

@ -1129,63 +1129,67 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CornerMinEigen, testing::Combine(
///////////////////////////////////////////////////////////////////////////////////////////////////////
// HoughLines
PARAM_TEST_CASE(HoughLines, cv::gpu::DeviceInfo, std::string)
PARAM_TEST_CASE(HoughLines, cv::gpu::DeviceInfo, cv::Size, UseRoi)
{
};
void drawLines(cv::Mat& dst, const std::vector<cv::Vec2f>& lines)
{
for (size_t i = 0; i < lines.size(); ++i)
void generateLines(cv::Mat& img)
{
float rho = lines[i][0], theta = lines[i][1];
cv::Point pt1, pt2;
double a = std::cos(theta), b = std::sin(theta);
double x0 = a*rho, y0 = b*rho;
pt1.x = cvRound(x0 + 1000*(-b));
pt1.y = cvRound(y0 + 1000*(a));
pt2.x = cvRound(x0 - 1000*(-b));
pt2.y = cvRound(y0 - 1000*(a));
cv::line(dst, pt1, pt2, cv::Scalar::all(255));
img.setTo(cv::Scalar::all(0));
cv::line(img, cv::Point(20, 0), cv::Point(20, img.rows), cv::Scalar::all(255));
cv::line(img, cv::Point(0, 50), cv::Point(img.cols, 50), cv::Scalar::all(255));
cv::line(img, cv::Point(0, 0), cv::Point(img.cols, img.rows), cv::Scalar::all(255));
cv::line(img, cv::Point(img.cols, 0), cv::Point(0, img.rows), cv::Scalar::all(255));
}
}
void drawLines(cv::Mat& dst, const std::vector<cv::Vec2f>& lines)
{
dst.setTo(cv::Scalar::all(0));
for (size_t i = 0; i < lines.size(); ++i)
{
float rho = lines[i][0], theta = lines[i][1];
cv::Point pt1, pt2;
double a = std::cos(theta), b = std::sin(theta);
double x0 = a*rho, y0 = b*rho;
pt1.x = cvRound(x0 + 1000*(-b));
pt1.y = cvRound(y0 + 1000*(a));
pt2.x = cvRound(x0 - 1000*(-b));
pt2.y = cvRound(y0 - 1000*(a));
cv::line(dst, pt1, pt2, cv::Scalar::all(255));
}
}
};
TEST_P(HoughLines, Accuracy)
{
const cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
const std::string fileName = GET_PARAM(1);
const cv::Size size = GET_PARAM(1);
const bool useRoi = GET_PARAM(2);
const float rho = 1.0f;
const float theta = static_cast<float>(CV_PI / 180);
const int threshold = 50;
const float theta = 1.5f * CV_PI / 180.0f;
const int threshold = 100;
cv::Mat img = readImage(fileName, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
cv::Mat edges;
cv::Canny(img, edges, 50, 200);
cv::Mat src(size, CV_8UC1);
generateLines(src);
cv::gpu::GpuMat d_lines;
cv::gpu::HoughLines(loadMat(edges), d_lines, rho, theta, threshold);
cv::gpu::HoughLines(loadMat(src, useRoi), d_lines, rho, theta, threshold);
std::vector<cv::Vec2f> lines;
cv::gpu::HoughLinesDownload(d_lines, lines);
cv::Mat dst(img.size(), CV_8UC1, cv::Scalar::all(0));
cv::Mat dst(size, CV_8UC1);
drawLines(dst, lines);
std::vector<cv::Vec2f> lines_gold;
cv::HoughLines(edges, lines_gold, rho, theta, threshold);
cv::Mat dst_gold(img.size(), CV_8UC1, cv::Scalar::all(0));
drawLines(dst_gold, lines_gold);
ASSERT_MAT_NEAR(dst_gold, dst, 0.0);
ASSERT_MAT_NEAR(src, dst, 0.0);
}
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HoughLines, testing::Combine(
ALL_DEVICES,
testing::Values(std::string("../cv/shared/pic1.png"),
std::string("../cv/shared/pic3.png"),
std::string("../cv/shared/pic5.png"),
std::string("../cv/shared/pic6.png"))));
DIFFERENT_SIZES,
WHOLE_SUBMAT));
} // namespace