performance issue for cuda TVL1 when gamma = 0

This commit is contained in:
Ernest Galbrun 2014-07-28 14:24:21 +02:00
parent df8f1a4355
commit 5623701acb
2 changed files with 55 additions and 34 deletions

View File

@ -226,7 +226,7 @@ namespace tvl1flow
const float gradVal = grad(y, x);
const float u1OldVal = u1(y, x);
const float u2OldVal = u2(y, x);
const float u3OldVal = u3(y, x);
const float u3OldVal = gamma ? u3(y, x) : 0;
const float rho = rho_c(y, x) + (I1wxVal * u1OldVal + I1wyVal * u2OldVal + gamma * u3OldVal);
@ -240,20 +240,23 @@ namespace tvl1flow
{
d1 = l_t * I1wxVal;
d2 = l_t * I1wyVal;
d3 = l_t * gamma;
if (gamma)
d3 = l_t * gamma;
}
else if (rho > l_t * gradVal)
{
d1 = -l_t * I1wxVal;
d2 = -l_t * I1wyVal;
d3 = -l_t * gamma;
if (gamma)
d3 = -l_t * gamma;
}
else if (gradVal > numeric_limits<float>::epsilon())
{
const float fi = -rho / gradVal;
d1 = fi * I1wxVal;
d2 = fi * I1wyVal;
d3 = fi * gamma;
if (gamma)
d3 = fi * gamma;
}
const float v1 = u1OldVal + d1;
@ -264,24 +267,24 @@ namespace tvl1flow
const float div_p1 = divergence(p11, p12, y, x);
const float div_p2 = divergence(p21, p22, y, x);
const float div_p3 = divergence(p31, p32, y, x);
const float div_p3 = gamma ? divergence(p31, p32, y, x) : 0;
// estimate the values of the optical flow (u1, u2)
const float u1NewVal = v1 + theta * div_p1;
const float u2NewVal = v2 + theta * div_p2;
const float u3NewVal = v3 + theta * div_p3;
const float u3NewVal = gamma ? v3 + theta * div_p3 : 0;
u1(y, x) = u1NewVal;
u2(y, x) = u2NewVal;
u3(y, x) = u3NewVal;
if (gamma)
u3(y, x) = u3NewVal;
if (calcError)
{
const float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
const float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
const float n3 = 0;// (u3OldVal - u3NewVal) * (u3OldVal - u3NewVal);
error(y, x) = n1 + n2 + n3;
error(y, x) = n1 + n2;
}
}
@ -307,7 +310,7 @@ namespace tvl1flow
namespace tvl1flow
{
__global__ void estimateDualVariablesKernel(const PtrStepSzf u1, const PtrStepf u2, const PtrStepSzf u3,
PtrStepf p11, PtrStepf p12, PtrStepf p21, PtrStepf p22, PtrStepf p31, PtrStepf p32, const float taut)
PtrStepf p11, PtrStepf p12, PtrStepf p21, PtrStepf p22, PtrStepf p31, PtrStepf p32, const float taut, const float gamma)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -321,31 +324,34 @@ namespace tvl1flow
const float u2x = u2(y, ::min(x + 1, u1.cols - 1)) - u2(y, x);
const float u2y = u2(::min(y + 1, u1.rows - 1), x) - u2(y, x);
const float u3x = u3(y, ::min(x + 1, u1.cols - 1)) - u3(y, x);
const float u3y = u3(::min(y + 1, u1.rows - 1), x) - u3(y, x);
const float u3x = gamma ? u3(y, ::min(x + 1, u1.cols - 1)) - u3(y, x) : 0;
const float u3y = gamma ? u3(::min(y + 1, u1.rows - 1), x) - u3(y, x) : 0;
const float g1 = ::hypotf(u1x, u1y);
const float g2 = ::hypotf(u2x, u2y);
const float g3 = ::hypotf(u3x, u3y);
const float g3 = gamma ? ::hypotf(u3x, u3y) : 0;
const float ng1 = 1.0f + taut * g1;
const float ng2 = 1.0f + taut * g2;
const float ng3 = 1.0f + taut * g3;
const float ng3 = gamma ? 1.0f + taut * g3 : 0;
p11(y, x) = (p11(y, x) + taut * u1x) / ng1;
p12(y, x) = (p12(y, x) + taut * u1y) / ng1;
p21(y, x) = (p21(y, x) + taut * u2x) / ng2;
p22(y, x) = (p22(y, x) + taut * u2y) / ng2;
p31(y, x) = (p31(y, x) + taut * u3x) / ng3;
p32(y, x) = (p32(y, x) + taut * u3y) / ng3;
if (gamma)
{
p31(y, x) = (p31(y, x) + taut * u3x) / ng3;
p32(y, x) = (p32(y, x) + taut * u3y) / ng3;
}
}
void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32, float taut)
void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32, float taut, float gamma)
{
const dim3 block(32, 8);
const dim3 grid(divUp(u1.cols, block.x), divUp(u1.rows, block.y));
estimateDualVariablesKernel<<<grid, block>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut);
estimateDualVariablesKernel<<<grid, block>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );

View File

@ -94,7 +94,8 @@ void cv::cuda::OpticalFlowDual_TVL1_CUDA::operator ()(const GpuMat& I0, const Gp
u1s[0] = flowx;
u2s[0] = flowy;
u3s[0].create(I0.size(), CV_32FC1);
if (gamma)
u3s[0].create(I0.size(), CV_32FC1);
I1x_buf.create(I0.size(), CV_32FC1);
I1y_buf.create(I0.size(), CV_32FC1);
@ -110,9 +111,11 @@ void cv::cuda::OpticalFlowDual_TVL1_CUDA::operator ()(const GpuMat& I0, const Gp
p12_buf.create(I0.size(), CV_32FC1);
p21_buf.create(I0.size(), CV_32FC1);
p22_buf.create(I0.size(), CV_32FC1);
p31_buf.create(I0.size(), CV_32FC1);
p32_buf.create(I0.size(), CV_32FC1);
if (gamma)
{
p31_buf.create(I0.size(), CV_32FC1);
p32_buf.create(I0.size(), CV_32FC1);
}
diff_buf.create(I0.size(), CV_32FC1);
// create the scales
@ -140,7 +143,8 @@ void cv::cuda::OpticalFlowDual_TVL1_CUDA::operator ()(const GpuMat& I0, const Gp
u1s[s].create(I0s[s].size(), CV_32FC1);
u2s[s].create(I0s[s].size(), CV_32FC1);
}
u3s[s].create(I0s[s].size(), CV_32FC1);
if (gamma)
u3s[s].create(I0s[s].size(), CV_32FC1);
}
if (!useInitialFlow)
@ -148,7 +152,8 @@ void cv::cuda::OpticalFlowDual_TVL1_CUDA::operator ()(const GpuMat& I0, const Gp
u1s[nscales-1].setTo(Scalar::all(0));
u2s[nscales-1].setTo(Scalar::all(0));
}
u3s[nscales - 1].setTo(Scalar::all(0));
if (gamma)
u3s[nscales - 1].setTo(Scalar::all(0));
// pyramidal structure for computing the optical flow
for (int s = nscales - 1; s >= 0; --s)
@ -165,7 +170,8 @@ void cv::cuda::OpticalFlowDual_TVL1_CUDA::operator ()(const GpuMat& I0, const Gp
// zoom the optical flow for the next finer scale
cuda::resize(u1s[s], u1s[s - 1], I0s[s - 1].size());
cuda::resize(u2s[s], u2s[s - 1], I0s[s - 1].size());
cuda::resize(u3s[s], u3s[s - 1], I0s[s - 1].size());
if (gamma)
cuda::resize(u3s[s], u3s[s - 1], I0s[s - 1].size());
// scale the optical flow with the appropriate zoom factor
cuda::multiply(u1s[s - 1], Scalar::all(1/scaleStep), u1s[s - 1]);
@ -182,7 +188,7 @@ namespace tvl1flow
PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf error,
float l_t, float theta, float gamma, bool calcError);
void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32, float taut);
void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32, float taut, const float gamma);
}
void cv::cuda::OpticalFlowDual_TVL1_CUDA::procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3)
@ -211,14 +217,21 @@ void cv::cuda::OpticalFlowDual_TVL1_CUDA::procOneScale(const GpuMat& I0, const G
GpuMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p31 = p31_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p32 = p32_buf(Rect(0, 0, I0.cols, I0.rows));
GpuMat p31, p32;
if (gamma)
{
p31 = p31_buf(Rect(0, 0, I0.cols, I0.rows));
p32 = p32_buf(Rect(0, 0, I0.cols, I0.rows));
}
p11.setTo(Scalar::all(0));
p12.setTo(Scalar::all(0));
p21.setTo(Scalar::all(0));
p22.setTo(Scalar::all(0));
p31.setTo(Scalar::all(0));
p32.setTo(Scalar::all(0));
if (gamma)
{
p31.setTo(Scalar::all(0));
p32.setTo(Scalar::all(0));
}
GpuMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
@ -248,7 +261,7 @@ void cv::cuda::OpticalFlowDual_TVL1_CUDA::procOneScale(const GpuMat& I0, const G
prevError -= scaledEpsilon;
}
estimateDualVariables(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut);
estimateDualVariables(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
}
}
}
@ -275,9 +288,11 @@ void cv::cuda::OpticalFlowDual_TVL1_CUDA::collectGarbage()
p12_buf.release();
p21_buf.release();
p22_buf.release();
p31_buf.release();
p32_buf.release();
if (gamma)
{
p31_buf.release();
p32_buf.release();
}
diff_buf.release();
norm_buf.release();
}