optimizations:

- new reduce implementation (with kepler optimizations)
- saturate_cast implemented via inline asm
- video SIMD instructions in element operations
- float arithmetic instead of double
- new deviceSupports function
This commit is contained in:
Vladislav Vinogradov
2013-01-23 14:43:36 +04:00
parent ae6266e101
commit 281d036fcf
45 changed files with 9379 additions and 8846 deletions

View File

@@ -53,35 +53,32 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat&, const GpuMat&, GpuMat&, Gpu
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace device
namespace pyrlk
{
namespace pyrlk
{
void loadConstants(int2 winSize, int iters);
void loadConstants(int2 winSize, int iters);
void lkSparse1_gpu(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void lkSparse4_gpu(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void lkDense_gpu(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
}
}}}
void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
}
namespace
{
void calcPatchSize(cv::Size winSize, dim3& block, dim3& patch, bool isDeviceArch11)
void calcPatchSize(cv::Size winSize, dim3& block, dim3& patch)
{
if (winSize.width > 32 && winSize.width > 2 * winSize.height)
{
block.x = isDeviceArch11 ? 16 : 32;
block.x = deviceSupports(FEATURE_SET_COMPUTE_12) ? 32 : 16;
block.y = 8;
}
else
{
block.x = 16;
block.y = isDeviceArch11 ? 8 : 16;
block.y = deviceSupports(FEATURE_SET_COMPUTE_12) ? 16 : 8;
}
patch.x = (winSize.width + block.x - 1) / block.x;
@@ -93,8 +90,6 @@ namespace
void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err)
{
using namespace cv::gpu::device::pyrlk;
if (prevPts.empty())
{
nextPts.release();
@@ -104,9 +99,9 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
}
dim3 block, patch;
calcPatchSize(winSize, block, patch, isDeviceArch11_);
calcPatchSize(winSize, block, patch);
CV_Assert(prevImg.type() == CV_8UC1 || prevImg.type() == CV_8UC3 || prevImg.type() == CV_8UC4);
CV_Assert(prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4);
CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
CV_Assert(maxLevel >= 0);
CV_Assert(winSize.width > 2 && winSize.height > 2);
@@ -155,19 +150,19 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
pyrDown(nextPyr_[level - 1], nextPyr_[level]);
}
loadConstants(make_int2(winSize.width, winSize.height), iters);
pyrlk::loadConstants(make_int2(winSize.width, winSize.height), iters);
for (int level = maxLevel; level >= 0; level--)
{
if (cn == 1)
{
lkSparse1_gpu(prevPyr_[level], nextPyr_[level],
pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
level, block, patch);
}
else
{
lkSparse4_gpu(prevPyr_[level], nextPyr_[level],
pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
level, block, patch);
}
@@ -176,8 +171,6 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err)
{
using namespace cv::gpu::device::pyrlk;
CV_Assert(prevImg.type() == CV_8UC1);
CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
CV_Assert(maxLevel >= 0);
@@ -211,7 +204,7 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
vPyr_[1].setTo(Scalar::all(0));
int2 winSize2i = make_int2(winSize.width, winSize.height);
loadConstants(winSize2i, iters);
pyrlk::loadConstants(winSize2i, iters);
PtrStepSzf derr = err ? *err : PtrStepSzf();
@@ -221,7 +214,7 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
{
int idx2 = (idx + 1) & 1;
lkDense_gpu(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
pyrlk::dense(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
level == 0 ? derr : PtrStepSzf(), winSize2i);
if (level > 0)