Merge pull request #1346 from jet47:gpu-tvl1-optimization

This commit is contained in:
Roman Donchenko 2013-08-27 12:36:44 +04:00 committed by OpenCV Buildbot
commit 0e32d7ccb3
3 changed files with 26 additions and 11 deletions

View File

@ -427,8 +427,8 @@ PERF_TEST_P(ImagePair, Video_OpticalFlowDual_TVL1,
TEST_CYCLE() d_alg(d_frame0, d_frame1, u, v); TEST_CYCLE() d_alg(d_frame0, d_frame1, u, v);
GPU_SANITY_CHECK(u, 1e-2); GPU_SANITY_CHECK(u, 1e-1);
GPU_SANITY_CHECK(v, 1e-2); GPU_SANITY_CHECK(v, 1e-1);
} }
else else
{ {

View File

@ -211,7 +211,7 @@ namespace tvl1flow
const PtrStepf grad, const PtrStepf rho_c, const PtrStepf grad, const PtrStepf rho_c,
const PtrStepf p11, const PtrStepf p12, const PtrStepf p21, const PtrStepf p22, const PtrStepf p11, const PtrStepf p12, const PtrStepf p21, const PtrStepf p22,
PtrStepf u1, PtrStepf u2, PtrStepf error, PtrStepf u1, PtrStepf u2, PtrStepf error,
const float l_t, const float theta) const float l_t, const float theta, const bool calcError)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -265,21 +265,24 @@ namespace tvl1flow
u1(y, x) = u1NewVal; u1(y, x) = u1NewVal;
u2(y, x) = u2NewVal; u2(y, x) = u2NewVal;
const float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal); if (calcError)
const float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal); {
error(y, x) = n1 + n2; const float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
const float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
error(y, x) = n1 + n2;
}
} }
void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy, void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
PtrStepSzf grad, PtrStepSzf rho_c, PtrStepSzf grad, PtrStepSzf rho_c,
PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22,
PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error,
float l_t, float theta) float l_t, float theta, bool calcError)
{ {
const dim3 block(32, 8); const dim3 block(32, 8);
const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y)); const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y));
estimateUKernel<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, error, l_t, theta); estimateUKernel<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, error, l_t, theta, calcError);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );

View File

@ -173,7 +173,7 @@ namespace tvl1flow
PtrStepSzf grad, PtrStepSzf rho_c, PtrStepSzf grad, PtrStepSzf rho_c,
PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22,
PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error,
float l_t, float theta); float l_t, float theta, bool calcError);
void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, float taut); void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, float taut);
} }
@ -218,12 +218,24 @@ void cv::gpu::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat& I0, const Gpu
warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c); warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);
double error = numeric_limits<double>::max(); double error = numeric_limits<double>::max();
double prevError = 0.0;
for (int n = 0; error > scaledEpsilon && n < iterations; ++n) for (int n = 0; error > scaledEpsilon && n < iterations; ++n)
{ {
estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, diff, l_t, static_cast<float>(theta)); // some tweaks to make sum operation less frequently
bool calcError = (epsilon > 0) && (n & 0x1) && (prevError < scaledEpsilon);
if (epsilon > 0) estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, diff, l_t, static_cast<float>(theta), calcError);
if (calcError)
{
error = gpu::sum(diff, norm_buf)[0]; error = gpu::sum(diff, norm_buf)[0];
prevError = error;
}
else
{
error = numeric_limits<double>::max();
prevError -= scaledEpsilon;
}
estimateDualVariables(u1, u2, p11, p12, p21, p22, taut); estimateDualVariables(u1, u2, p11, p12, p21, p22, taut);
} }