Modified sparse PyrLK optical flow to accept an image pyramid as input, which allows image pyramids to be cached across successive calls.

Added unsigned char support for 1-, 3-, and 4-channel images.
Dan Moodie 2015-12-29 10:48:14 -05:00
parent 8d79285d02
commit 66738d748f
7 changed files with 818 additions and 213 deletions
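For context, a minimal usage sketch of the new pyramid-input path (not part of the commit; the helper name, window size, and level count are illustrative): the pyramids are built once with cuda::pyrDown and handed to calc() as vectors of GpuMat, so a video loop can reuse a frame's pyramid instead of rebuilding it.

// Sketch: caching image pyramids across successive sparse PyrLK calls.
// Assumes OpenCV built with cudaoptflow/cudawarping; names are illustrative.
#include <opencv2/cudaoptflow.hpp>
#include <opencv2/cudawarping.hpp>
#include <vector>

void trackCached(const cv::cuda::GpuMat& prevImg, const cv::cuda::GpuMat& nextImg,
                 const cv::cuda::GpuMat& prevPts, cv::cuda::GpuMat& nextPts,
                 cv::cuda::GpuMat& status, int maxLevel = 3)
{
    // Build both pyramids once, mirroring PyrLKOpticalFlowBase::buildImagePyramid.
    std::vector<cv::cuda::GpuMat> prevPyr(maxLevel + 1), nextPyr(maxLevel + 1);
    prevPyr[0] = prevImg;
    nextPyr[0] = nextImg;
    for (int level = 1; level <= maxLevel; ++level)
    {
        cv::cuda::pyrDown(prevPyr[level - 1], prevPyr[level]);
        cv::cuda::pyrDown(nextPyr[level - 1], nextPyr[level]);
    }

    cv::Ptr<cv::cuda::SparsePyrLKOpticalFlow> pyrLK =
        cv::cuda::SparsePyrLKOpticalFlow::create(cv::Size(21, 21), maxLevel);

    // With this commit, calc() also accepts vectors of GpuMat (one per level),
    // so on the next frame nextPyr can be passed again as the "previous" pyramid.
    pyrLK->calc(prevPyr, nextPyr, prevPts, nextPts, status);
}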


@@ -116,10 +116,10 @@ PERF_TEST_P(ImagePair_Gray_NPts_WinSz_Levels_Iters, PyrLKOpticalFlowSparse,
     const int levels = GET_PARAM(4);
     const int iters = GET_PARAM(5);

-    const cv::Mat frame0 = readImage(imagePair.first, useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
+    cv::Mat frame0 = readImage(imagePair.first, useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
     ASSERT_FALSE(frame0.empty());

-    const cv::Mat frame1 = readImage(imagePair.second, useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
+    cv::Mat frame1 = readImage(imagePair.second, useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
     ASSERT_FALSE(frame1.empty());

     cv::Mat gray_frame;
@@ -131,6 +131,14 @@ PERF_TEST_P(ImagePair_Gray_NPts_WinSz_Levels_Iters, PyrLKOpticalFlowSparse,
     cv::Mat pts;
     cv::goodFeaturesToTrack(gray_frame, pts, points, 0.01, 0.0);

+    frame0.convertTo(frame0, CV_32F);
+    frame1.convertTo(frame1, CV_32F);
+    if (!useGray)
+    {
+        cv::cvtColor(frame0, frame0, cv::COLOR_BGR2BGRA);
+        cv::cvtColor(frame1, frame1, cv::COLOR_BGR2BGRA);
+    }
+
     if (PERF_RUN_CUDA())
     {
         const cv::cuda::GpuMat d_pts(pts.reshape(2, 1));


@@ -48,6 +48,8 @@
 #include "opencv2/core/cuda/limits.hpp"
 #include "opencv2/core/cuda/vec_math.hpp"
 #include "opencv2/core/cuda/reduce.hpp"
+#include "opencv2/core/cuda/filters.hpp"
+#include "opencv2/core/cuda/border_interpolate.hpp"

 using namespace cv::cuda;
 using namespace cv::cuda::device;
@@ -60,53 +62,240 @@ namespace pyrlk
     __constant__ int c_halfWin_y;
     __constant__ int c_iters;

+    texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
+    texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
+    texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
     texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
     texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
     texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);

+    texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
+    texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
+    texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
     texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
     texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);

-    template <int cn> struct Tex_I;
-    template <> struct Tex_I<1>
+    template <int cn, typename T> struct Tex_I
+    {
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T, cn>::vec_type> I)
+        {
+            (void)I;
+        }
+    };
+    template <> struct Tex_I<1, uchar>
+    {
+        static __device__ __forceinline__ float read(float x, float y)
+        {
+            return tex2D(tex_I8U, x, y);
+        }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& I)
+        {
+            bindTexture(&tex_I8U, I);
+        }
+    };
+    template <> struct Tex_I<1, ushort>
+    {
+        static __device__ __forceinline__ float read(float x, float y)
+        {
+            return 0.0;
+        }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort>& I)
+        {
+            (void)I;
+        }
+    };
+    template <> struct Tex_I<1, int>
+    {
+        static __device__ __forceinline__ float read(float x, float y)
+        {
+            return 0.0;
+        }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<int>& I)
+        {
+            (void)I;
+        }
+    };
+    template <> struct Tex_I<1, float>
     {
         static __device__ __forceinline__ float read(float x, float y)
         {
             return tex2D(tex_If, x, y);
         }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& I)
+        {
+            bindTexture(&tex_If, I);
+        }
     };
-    template <> struct Tex_I<4>
+    // ****************** 3 channel specializations ************************
+    template <> struct Tex_I<3, uchar>
+    {
+        static __device__ __forceinline__ float3 read(float x, float y)
+        {
+            return make_float3(0, 0, 0);
+        }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar3> I)
+        {
+            (void)I;
+        }
+    };
+    template <> struct Tex_I<3, ushort>
+    {
+        static __device__ __forceinline__ float3 read(float x, float y)
+        {
+            return make_float3(0, 0, 0);
+        }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort3> I)
+        {
+            (void)I;
+        }
+    };
+    template <> struct Tex_I<3, int>
+    {
+        static __device__ __forceinline__ float3 read(float x, float y)
+        {
+            return make_float3(0, 0, 0);
+        }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<int3> I)
+        {
+            (void)I;
+        }
+    };
+    template <> struct Tex_I<3, float>
+    {
+        static __device__ __forceinline__ float3 read(float x, float y)
+        {
+            return make_float3(0, 0, 0);
+        }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<float3> I)
+        {
+            (void)I;
+        }
+    };
+    // ****************** 4 channel specializations ************************
+    template <> struct Tex_I<4, uchar>
+    {
+        static __device__ __forceinline__ float4 read(float x, float y)
+        {
+            return tex2D(tex_I8UC4, x, y);
+        }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& I)
+        {
+            bindTexture(&tex_I8UC4, I);
+        }
+    };
+    template <> struct Tex_I<4, ushort>
+    {
+        static __device__ __forceinline__ float4 read(float x, float y)
+        {
+            return tex2D(tex_I16UC4, x, y);
+        }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& I)
+        {
+            bindTexture(&tex_I16UC4, I);
+        }
+    };
+    template <> struct Tex_I<4, float>
     {
         static __device__ __forceinline__ float4 read(float x, float y)
         {
             return tex2D(tex_If4, x, y);
         }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& I)
+        {
+            bindTexture(&tex_If4, I);
+        }
     };

-    template <int cn> struct Tex_J;
-    template <> struct Tex_J<1>
+    // ************* J ***************
+    template <int cn, typename T> struct Tex_J
+    {
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T, cn>::vec_type>& J)
+        {
+            (void)J;
+        }
+    };
+    template <> struct Tex_J<1, uchar>
+    {
+        static __device__ __forceinline__ float read(float x, float y)
+        {
+            return tex2D(tex_J8U, x, y);
+        }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& J)
+        {
+            bindTexture(&tex_J8U, J);
+        }
+    };
+    template <> struct Tex_J<1, float>
     {
         static __device__ __forceinline__ float read(float x, float y)
         {
             return tex2D(tex_Jf, x, y);
         }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& J)
+        {
+            bindTexture(&tex_Jf, J);
+        }
     };
-    template <> struct Tex_J<4>
+    // ************* 4 channel specializations ***************
+    template <> struct Tex_J<4, uchar>
+    {
+        static __device__ __forceinline__ float4 read(float x, float y)
+        {
+            return tex2D(tex_J8UC4, x, y);
+        }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& J)
+        {
+            bindTexture(&tex_J8UC4, J);
+        }
+    };
+    template <> struct Tex_J<4, ushort>
+    {
+        static __device__ __forceinline__ float4 read(float x, float y)
+        {
+            return tex2D(tex_J16UC4, x, y);
+        }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& J)
+        {
+            bindTexture(&tex_J16UC4, J);
+        }
+    };
+    template <> struct Tex_J<4, float>
     {
         static __device__ __forceinline__ float4 read(float x, float y)
         {
             return tex2D(tex_Jf4, x, y);
         }
+        static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& J)
+        {
+            bindTexture(&tex_Jf4, J);
+        }
     };

-    __device__ __forceinline__ void accum(float& dst, float val)
+    __device__ __forceinline__ void accum(float& dst, const float& val)
     {
         dst += val;
     }
-    __device__ __forceinline__ void accum(float& dst, const float4& val)
+    __device__ __forceinline__ void accum(float& dst, const float2& val)
+    {
+        dst += val.x + val.y;
+    }
+    __device__ __forceinline__ void accum(float& dst, const float3& val)
     {
         dst += val.x + val.y + val.z;
     }
+    __device__ __forceinline__ void accum(float& dst, const float4& val)
+    {
+        dst += val.x + val.y + val.z + val.w;
+    }

     __device__ __forceinline__ float abs_(float a)
     {
@@ -116,8 +305,46 @@ namespace pyrlk
     {
         return abs(a);
     }
+    __device__ __forceinline__ float2 abs_(const float2& a)
+    {
+        return abs(a);
+    }
+    __device__ __forceinline__ float3 abs_(const float3& a)
+    {
+        return abs(a);
+    }

-    template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
+    template<typename T> __device__ __forceinline__ typename TypeVec<float, 1>::vec_type ToFloat(const typename TypeVec<T, 1>::vec_type& other)
+    {
+        return other;
+    }
+    template<typename T> __device__ __forceinline__ typename TypeVec<float, 2>::vec_type ToFloat(const typename TypeVec<T, 2>::vec_type& other)
+    {
+        typename TypeVec<float, 2>::vec_type ret;
+        ret.x = other.x;
+        ret.y = other.y;
+        return ret;
+    }
+    template<typename T> __device__ __forceinline__ typename TypeVec<float, 3>::vec_type ToFloat(const typename TypeVec<T, 3>::vec_type& other)
+    {
+        typename TypeVec<float, 3>::vec_type ret;
+        ret.x = other.x;
+        ret.y = other.y;
+        ret.z = other.z;
+        return ret;
+    }
+    template<typename T> __device__ __forceinline__ typename TypeVec<float, 4>::vec_type ToFloat(const typename TypeVec<T, 4>::vec_type& other)
+    {
+        typename TypeVec<float, 4>::vec_type ret;
+        ret.x = other.x;
+        ret.y = other.y;
+        ret.z = other.z;
+        ret.w = other.w;
+        return ret;
+    }
+
+    template <int cn, int PATCH_X, int PATCH_Y, bool calcErr, typename T>
     __global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
     {
     #if __CUDA_ARCH__ <= 110
@@ -166,15 +393,15 @@ namespace pyrlk
                 float x = prevPt.x + xBase + 0.5f;
                 float y = prevPt.y + yBase + 0.5f;

-                I_patch[i][j] = Tex_I<cn>::read(x, y);
+                I_patch[i][j] = Tex_I<cn, T>::read(x, y);

                 // Sharr Deriv

-                work_type dIdx = 3.0f * Tex_I<cn>::read(x+1, y-1) + 10.0f * Tex_I<cn>::read(x+1, y) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
-                                 (3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x-1, y) + 3.0f * Tex_I<cn>::read(x-1, y+1));
-                work_type dIdy = 3.0f * Tex_I<cn>::read(x-1, y+1) + 10.0f * Tex_I<cn>::read(x, y+1) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
-                                 (3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x, y-1) + 3.0f * Tex_I<cn>::read(x+1, y-1));
+                work_type dIdx = 3.0f * Tex_I<cn, T>::read(x+1, y-1) + 10.0f * Tex_I<cn, T>::read(x+1, y) + 3.0f * Tex_I<cn, T>::read(x+1, y+1) -
+                                 (3.0f * Tex_I<cn, T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x-1, y) + 3.0f * Tex_I<cn, T>::read(x-1, y+1));
+                work_type dIdy = 3.0f * Tex_I<cn, T>::read(x-1, y+1) + 10.0f * Tex_I<cn, T>::read(x, y+1) + 3.0f * Tex_I<cn, T>::read(x+1, y+1) -
+                                 (3.0f * Tex_I<cn, T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x, y-1) + 3.0f * Tex_I<cn, T>::read(x+1, y-1));

                 dIdx_patch[i][j] = dIdx;
                 dIdy_patch[i][j] = dIdy;
@@ -243,7 +470,7 @@ namespace pyrlk
                 for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
                 {
                     work_type I_val = I_patch[i][j];
-                    work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
+                    work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);

                     work_type diff = (J_val - I_val) * 32.0f;
@@ -286,7 +513,7 @@ namespace pyrlk
                 for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
                 {
                     work_type I_val = I_patch[i][j];
-                    work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
+                    work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);

                     work_type diff = J_val - I_val;
@@ -309,22 +536,352 @@ namespace pyrlk
         }
     }

-    template <int cn, int PATCH_X, int PATCH_Y>
-    void sparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                       int level, dim3 block, cudaStream_t stream)
-    {
-        dim3 grid(ptcount);
-
-        if (level == 0 && err)
-            sparseKernel<cn, PATCH_X, PATCH_Y, true><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
-        else
-            sparseKernel<cn, PATCH_X, PATCH_Y, false><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
-
-        cudaSafeCall(cudaGetLastError());
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
+    // Kernel, uses non texture fetches
+    template <int PATCH_X, int PATCH_Y, bool calcErr, int cn, typename T, typename Ptr2D>
+    __global__ void sparseKernel_(Ptr2D I, Ptr2D J, const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
+    {
+    #if __CUDA_ARCH__ <= 110
+        const int BLOCK_SIZE = 128;
+    #else
+        const int BLOCK_SIZE = 256;
+    #endif
+
+        __shared__ float smem1[BLOCK_SIZE];
+        __shared__ float smem2[BLOCK_SIZE];
+        __shared__ float smem3[BLOCK_SIZE];
+
+        const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        float2 prevPt = prevPts[blockIdx.x];
+        prevPt.x *= (1.0f / (1 << level));
+        prevPt.y *= (1.0f / (1 << level));
+
+        if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
+        {
+            if (tid == 0 && level == 0)
+                status[blockIdx.x] = 0;
+            return;
+        }
+
+        prevPt.x -= c_halfWin_x;
+        prevPt.y -= c_halfWin_y;
+
+        // extract the patch from the first image, compute covariation matrix of derivatives
+        float A11 = 0;
+        float A12 = 0;
+        float A22 = 0;
+
+        typedef typename TypeVec<float, cn>::vec_type work_type;
+
+        work_type I_patch[PATCH_Y][PATCH_X];
+        work_type dIdx_patch[PATCH_Y][PATCH_X];
+        work_type dIdy_patch[PATCH_Y][PATCH_X];
+
+        for (int yBase = threadIdx.y, i = 0; yBase < c_winSize_y; yBase += blockDim.y, ++i)
+        {
+            for (int xBase = threadIdx.x, j = 0; xBase < c_winSize_x; xBase += blockDim.x, ++j)
+            {
+                float x = prevPt.x + xBase + 0.5f;
+                float y = prevPt.y + yBase + 0.5f;
+
+                I_patch[i][j] = ToFloat<T>(I(y, x));
+
+                // Sharr Deriv
+                work_type dIdx = 3.0f * I(y - 1, x + 1) + 10.0f * I(y, x + 1) + 3.0f * I(y + 1, x + 1) -
+                                 (3.0f * I(y - 1, x - 1) + 10.0f * I(y, x - 1) + 3.0f * I(y + 1, x - 1));
+                work_type dIdy = 3.0f * I(y + 1, x - 1) + 10.0f * I(y + 1, x) + 3.0f * I(y + 1, x + 1) -
+                                 (3.0f * I(y - 1, x - 1) + 10.0f * I(y - 1, x) + 3.0f * I(y - 1, x + 1));
+
+                dIdx_patch[i][j] = dIdx;
+                dIdy_patch[i][j] = dIdy;
+
+                accum(A11, dIdx * dIdx);
+                accum(A12, dIdx * dIdy);
+                accum(A22, dIdy * dIdy);
+            }
+        }
+
+        reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));
+
+    #if __CUDA_ARCH__ >= 300
+        if (tid == 0)
+        {
+            smem1[0] = A11;
+            smem2[0] = A12;
+            smem3[0] = A22;
+        }
+    #endif
+
+        __syncthreads();
+
+        A11 = smem1[0];
+        A12 = smem2[0];
+        A22 = smem3[0];
+
+        float D = A11 * A22 - A12 * A12;
+
+        if (D < numeric_limits<float>::epsilon())
+        {
+            if (tid == 0 && level == 0)
+                status[blockIdx.x] = 0;
+            return;
+        }
+
+        D = 1.f / D;
+
+        A11 *= D;
+        A12 *= D;
+        A22 *= D;
+
+        float2 nextPt = nextPts[blockIdx.x];
+        nextPt.x *= 2.f;
+        nextPt.y *= 2.f;
+
+        nextPt.x -= c_halfWin_x;
+        nextPt.y -= c_halfWin_y;
+
+        for (int k = 0; k < c_iters; ++k)
+        {
+            if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows)
+            {
+                if (tid == 0 && level == 0)
+                    status[blockIdx.x] = 0;
+                return;
+            }
+
+            float b1 = 0;
+            float b2 = 0;
+
+            for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
+            {
+                for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
+                {
+                    work_type I_val = I_patch[i][j];
+                    work_type J_val = ToFloat<T>(J(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f));
+
+                    work_type diff = (J_val - I_val) * 32.0f;
+
+                    accum(b1, diff * dIdx_patch[i][j]);
+                    accum(b2, diff * dIdy_patch[i][j]);
+                }
+            }
+
+            reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus<float>(), plus<float>()));
+
+    #if __CUDA_ARCH__ >= 300
+            if (tid == 0)
+            {
+                smem1[0] = b1;
+                smem2[0] = b2;
+            }
+    #endif
+
+            __syncthreads();
+
+            b1 = smem1[0];
+            b2 = smem2[0];
+
+            float2 delta;
+            delta.x = A12 * b2 - A22 * b1;
+            delta.y = A12 * b1 - A11 * b2;
+
+            nextPt.x += delta.x;
+            nextPt.y += delta.y;
+
+            if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
+                break;
+        }
+
+        float errval = 0;
+        if (calcErr)
+        {
+            for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
+            {
+                for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
+                {
+                    work_type I_val = I_patch[i][j];
+                    work_type J_val = ToFloat<T>(J(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f));
+
+                    work_type diff = J_val - I_val;
+
+                    accum(errval, abs_(diff));
+                }
+            }
+
+            reduce<BLOCK_SIZE>(smem1, errval, tid, plus<float>());
+        }
+
+        if (tid == 0)
+        {
+            nextPt.x += c_halfWin_x;
+            nextPt.y += c_halfWin_y;
+
+            nextPts[blockIdx.x] = nextPt;
+
+            if (calcErr)
+                err[blockIdx.x] = static_cast<float>(errval) / (3 * c_winSize_x * c_winSize_y);
+        }
+    } // __global__ void sparseKernel_
+
+    template <int cn, int PATCH_X, int PATCH_Y, typename T> class sparse_caller
+    {
+    public:
+        static void call(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                         int level, dim3 block, cudaStream_t stream)
+        {
+            dim3 grid(ptcount);
+            (void)I;
+            (void)J;
+            if (level == 0 && err)
+                sparseKernel<cn, PATCH_X, PATCH_Y, true, T><<<grid, block, 0, stream>>>(prevPts, nextPts, status, err, level, rows, cols);
+            else
+                sparseKernel<cn, PATCH_X, PATCH_Y, false, T><<<grid, block, 0, stream>>>(prevPts, nextPts, status, err, level, rows, cols);
+
+            cudaSafeCall(cudaGetLastError());
+
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+    };
+
+    // Specialization to use non texture path because for some reason the texture path keeps failing accuracy tests
+    template <int PATCH_X, int PATCH_Y> class sparse_caller<1, PATCH_X, PATCH_Y, unsigned short>
+    {
+    public:
+        typedef typename TypeVec<unsigned short, 1>::vec_type work_type;
+        typedef PtrStepSz<work_type> Ptr2D;
+        typedef BrdConstant<work_type> BrdType;
+        typedef BorderReader<Ptr2D, BrdType> Reader;
+        typedef LinearFilter<Reader> Filter;
+        static void call(Ptr2D I, Ptr2D J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                         int level, dim3 block, cudaStream_t stream)
+        {
+            dim3 grid(ptcount);
+            if (level == 0 && err)
+            {
+                sparseKernel_<PATCH_X, PATCH_Y, true, 1, unsigned short><<<grid, block, 0, stream>>>(
+                    Filter(Reader(I, BrdType(rows, cols))),
+                    Filter(Reader(J, BrdType(rows, cols))),
+                    prevPts, nextPts, status, err, level, rows, cols);
+            }
+            else
+            {
+                sparseKernel_<PATCH_X, PATCH_Y, false, 1, unsigned short><<<grid, block, 0, stream>>>(
+                    Filter(Reader(I, BrdType(rows, cols))),
+                    Filter(Reader(J, BrdType(rows, cols))),
+                    prevPts, nextPts, status, err, level, rows, cols);
+            }
+            cudaSafeCall(cudaGetLastError());
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+    };
+
+    // Specialization for int because the texture path keeps failing
+    template <int PATCH_X, int PATCH_Y> class sparse_caller<1, PATCH_X, PATCH_Y, int>
+    {
+    public:
+        typedef typename TypeVec<int, 1>::vec_type work_type;
+        typedef PtrStepSz<work_type> Ptr2D;
+        typedef BrdConstant<work_type> BrdType;
+        typedef BorderReader<Ptr2D, BrdType> Reader;
+        typedef LinearFilter<Reader> Filter;
+        static void call(Ptr2D I, Ptr2D J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                         int level, dim3 block, cudaStream_t stream)
+        {
+            dim3 grid(ptcount);
+            if (level == 0 && err)
+            {
+                sparseKernel_<PATCH_X, PATCH_Y, true, 1, int><<<grid, block, 0, stream>>>(
+                    Filter(Reader(I, BrdType(rows, cols))),
+                    Filter(Reader(J, BrdType(rows, cols))),
+                    prevPts, nextPts, status, err, level, rows, cols);
+            }
+            else
+            {
+                sparseKernel_<PATCH_X, PATCH_Y, false, 1, int><<<grid, block, 0, stream>>>(
+                    Filter(Reader(I, BrdType(rows, cols))),
+                    Filter(Reader(J, BrdType(rows, cols))),
+                    prevPts, nextPts, status, err, level, rows, cols);
+            }
+            cudaSafeCall(cudaGetLastError());
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+    };
+
+    template <int PATCH_X, int PATCH_Y> class sparse_caller<4, PATCH_X, PATCH_Y, int>
+    {
+    public:
+        typedef typename TypeVec<int, 4>::vec_type work_type;
+        typedef PtrStepSz<work_type> Ptr2D;
+        typedef BrdConstant<work_type> BrdType;
+        typedef BorderReader<Ptr2D, BrdType> Reader;
+        typedef LinearFilter<Reader> Filter;
+        static void call(Ptr2D I, Ptr2D J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                         int level, dim3 block, cudaStream_t stream)
+        {
+            dim3 grid(ptcount);
+            if (level == 0 && err)
+            {
+                sparseKernel_<PATCH_X, PATCH_Y, true, 4, int><<<grid, block, 0, stream>>>(
+                    Filter(Reader(I, BrdType(rows, cols))),
+                    Filter(Reader(J, BrdType(rows, cols))),
+                    prevPts, nextPts, status, err, level, rows, cols);
+            }
+            else
+            {
+                sparseKernel_<PATCH_X, PATCH_Y, false, 4, int><<<grid, block, 0, stream>>>(
+                    Filter(Reader(I, BrdType(rows, cols))),
+                    Filter(Reader(J, BrdType(rows, cols))),
+                    prevPts, nextPts, status, err, level, rows, cols);
+            }
+            cudaSafeCall(cudaGetLastError());
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+    };
+
+    using namespace cv::cuda::device;
+
+    template <int PATCH_X, int PATCH_Y, typename T> class sparse_caller<3, PATCH_X, PATCH_Y, T>
+    {
+    public:
+        typedef typename TypeVec<T, 3>::vec_type work_type;
+        typedef PtrStepSz<work_type> Ptr2D;
+        typedef BrdConstant<work_type> BrdType;
+        typedef BorderReader<Ptr2D, BrdType> Reader;
+        typedef LinearFilter<Reader> Filter;
+        static void call(Ptr2D I, Ptr2D J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                         int level, dim3 block, cudaStream_t stream)
+        {
+            dim3 grid(ptcount);
+            if (level == 0 && err)
+            {
+                sparseKernel_<PATCH_X, PATCH_Y, true, 3, T><<<grid, block, 0, stream>>>(
+                    Filter(Reader(I, BrdType(rows, cols))),
+                    Filter(Reader(J, BrdType(rows, cols))),
+                    prevPts, nextPts, status, err, level, rows, cols);
+            }
+            else
+            {
+                sparseKernel_<PATCH_X, PATCH_Y, false, 3, T><<<grid, block, 0, stream>>>(
+                    Filter(Reader(I, BrdType(rows, cols))),
+                    Filter(Reader(J, BrdType(rows, cols))),
+                    prevPts, nextPts, status, err, level, rows, cols);
+            }
+            cudaSafeCall(cudaGetLastError());
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+    };

     template <bool calcErr>
     __global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
@@ -484,57 +1041,36 @@ namespace pyrlk
         cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters, &iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
     }

-    void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                 int level, dim3 block, dim3 patch, cudaStream_t stream)
-    {
-        typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                               int level, dim3 block, cudaStream_t stream);
-
-        static const func_t funcs[5][5] =
-        {
-            {sparse_caller<1, 1, 1>, sparse_caller<1, 2, 1>, sparse_caller<1, 3, 1>, sparse_caller<1, 4, 1>, sparse_caller<1, 5, 1>},
-            {sparse_caller<1, 1, 2>, sparse_caller<1, 2, 2>, sparse_caller<1, 3, 2>, sparse_caller<1, 4, 2>, sparse_caller<1, 5, 2>},
-            {sparse_caller<1, 1, 3>, sparse_caller<1, 2, 3>, sparse_caller<1, 3, 3>, sparse_caller<1, 4, 3>, sparse_caller<1, 5, 3>},
-            {sparse_caller<1, 1, 4>, sparse_caller<1, 2, 4>, sparse_caller<1, 3, 4>, sparse_caller<1, 4, 4>, sparse_caller<1, 5, 4>},
-            {sparse_caller<1, 1, 5>, sparse_caller<1, 2, 5>, sparse_caller<1, 3, 5>, sparse_caller<1, 4, 5>, sparse_caller<1, 5, 5>}
-        };
-
-        bindTexture(&tex_If, I);
-        bindTexture(&tex_Jf, J);
-
-        funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
-                                        level, block, stream);
-    }
-
-    void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                 int level, dim3 block, dim3 patch, cudaStream_t stream)
-    {
-        typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                               int level, dim3 block, cudaStream_t stream);
-
-        static const func_t funcs[5][5] =
-        {
-            {sparse_caller<4, 1, 1>, sparse_caller<4, 2, 1>, sparse_caller<4, 3, 1>, sparse_caller<4, 4, 1>, sparse_caller<4, 5, 1>},
-            {sparse_caller<4, 1, 2>, sparse_caller<4, 2, 2>, sparse_caller<4, 3, 2>, sparse_caller<4, 4, 2>, sparse_caller<4, 5, 2>},
-            {sparse_caller<4, 1, 3>, sparse_caller<4, 2, 3>, sparse_caller<4, 3, 3>, sparse_caller<4, 4, 3>, sparse_caller<4, 5, 3>},
-            {sparse_caller<4, 1, 4>, sparse_caller<4, 2, 4>, sparse_caller<4, 3, 4>, sparse_caller<4, 4, 4>, sparse_caller<4, 5, 4>},
-            {sparse_caller<4, 1, 5>, sparse_caller<4, 2, 5>, sparse_caller<4, 3, 5>, sparse_caller<4, 4, 5>, sparse_caller<4, 5, 5>}
-        };
-
-        bindTexture(&tex_If4, I);
-        bindTexture(&tex_Jf4, J);
-
-        funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
-                                        level, block, stream);
-    }
-
-    void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
+    template<typename T, int cn> struct pyrLK_caller
+    {
+        static void sparse(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                           int level, dim3 block, dim3 patch, cudaStream_t stream)
+        {
+            typedef void(*func_t)(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J,
+                                  int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                                  int level, dim3 block, cudaStream_t stream);
+
+            static const func_t funcs[5][5] =
+            {
+                { sparse_caller<cn, 1, 1, T>::call, sparse_caller<cn, 2, 1, T>::call, sparse_caller<cn, 3, 1, T>::call, sparse_caller<cn, 4, 1, T>::call, sparse_caller<cn, 5, 1, T>::call },
+                { sparse_caller<cn, 1, 2, T>::call, sparse_caller<cn, 2, 2, T>::call, sparse_caller<cn, 3, 2, T>::call, sparse_caller<cn, 4, 2, T>::call, sparse_caller<cn, 5, 2, T>::call },
+                { sparse_caller<cn, 1, 3, T>::call, sparse_caller<cn, 2, 3, T>::call, sparse_caller<cn, 3, 3, T>::call, sparse_caller<cn, 4, 3, T>::call, sparse_caller<cn, 5, 3, T>::call },
+                { sparse_caller<cn, 1, 4, T>::call, sparse_caller<cn, 2, 4, T>::call, sparse_caller<cn, 3, 4, T>::call, sparse_caller<cn, 4, 4, T>::call, sparse_caller<cn, 5, 4, T>::call },
+                { sparse_caller<cn, 1, 5, T>::call, sparse_caller<cn, 2, 5, T>::call, sparse_caller<cn, 3, 5, T>::call, sparse_caller<cn, 4, 5, T>::call, sparse_caller<cn, 5, 5, T>::call }
+            };
+
+            Tex_I<cn, T>::bindTexture_(I);
+            Tex_J<cn, T>::bindTexture_(J);
+
+            funcs[patch.y - 1][patch.x - 1](I, J, I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
+                                            level, block, stream);
+        }
+
+        static void dense(PtrStepSzb I, PtrStepSz<T> J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
         {
             dim3 block(16, 16);
             dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));

-            bindTexture(&tex_Ib, I);
-            bindTexture(&tex_Jf, J);
+            Tex_I<1, uchar>::bindTexture_(I);
+            Tex_J<1, T>::bindTexture_(J);

             int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
             const int patchWidth = block.x + 2 * halfWin.x;
@@ -555,6 +1091,22 @@ namespace pyrlk
             if (stream == 0)
                 cudaSafeCall(cudaDeviceSynchronize());
         }
+    };
+
+    template class pyrLK_caller<unsigned char, 1>;
+    template class pyrLK_caller<unsigned short, 1>;
+    template class pyrLK_caller<int, 1>;
+    template class pyrLK_caller<float, 1>;
+
+    template class pyrLK_caller<unsigned char, 3>;
+    template class pyrLK_caller<unsigned short, 3>;
+    template class pyrLK_caller<int, 3>;
+    template class pyrLK_caller<float, 3>;
+
+    template class pyrLK_caller<unsigned char, 4>;
+    template class pyrLK_caller<unsigned short, 4>;
+    template class pyrLK_caller<int, 4>;
+    template class pyrLK_caller<float, 4>;
 }

 #endif /* CUDA_DISABLER */
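As a standalone illustration of the non-texture read path used by the sparse_caller specializations above (a sketch assuming the core CUDA device headers are available; the kernel and its names are hypothetical, not part of the commit), LinearFilter layered over BorderReader reproduces a border-safe bilinear fetch in plain device code, which is what sparseKernel_ relies on instead of tex2D():

// Hypothetical kernel: bilinearly sample one ushort image at a subpixel location.
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/filters.hpp"

using namespace cv::cuda;
using namespace cv::cuda::device;

__global__ void bilinearSampleKernel(PtrStepSz<ushort> img, float2 pt, float* out)
{
    typedef BrdConstant<ushort> BrdType;                     // out-of-bounds reads yield a constant (0)
    typedef BorderReader<PtrStepSz<ushort>, BrdType> Reader; // bounds-checked pixel access
    typedef LinearFilter<Reader> Filter;                     // bilinear interpolation over the reader

    Filter filter(Reader(img, BrdType(img.rows, img.cols)));
    *out = filter(pt.y, pt.x);                               // note the (y, x) argument order
}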


@@ -52,7 +52,7 @@
 #include "opencv2/video.hpp"

 #include "opencv2/core/private.cuda.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
 #include "opencv2/opencv_modules.hpp"

 #ifdef HAVE_OPENCV_CUDALEGACY


@@ -56,14 +56,20 @@ Ptr<DensePyrLKOpticalFlow> cv::cuda::DensePyrLKOpticalFlow::create(Size, int, in
 namespace pyrlk
 {
     void loadConstants(int2 winSize, int iters, cudaStream_t stream);

-    void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                 int level, dim3 block, dim3 patch, cudaStream_t stream);
-    void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                 int level, dim3 block, dim3 patch, cudaStream_t stream);
-    void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
-               PtrStepSzf err, int2 winSize, cudaStream_t stream);
+    template<typename T, int cn> struct pyrLK_caller
+    {
+        static void sparse(PtrStepSz<typename device::TypeVec<T, cn>::vec_type> I, PtrStepSz<typename device::TypeVec<T, cn>::vec_type> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                           int level, dim3 block, dim3 patch, cudaStream_t stream);
+        static void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
+                          PtrStepSzf err, int2 winSize, cudaStream_t stream);
+    };
+
+    template<typename T, int cn> void dispatcher(GpuMat I, GpuMat J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                                                 int level, dim3 block, dim3 patch, cudaStream_t stream)
+    {
+        pyrLK_caller<T, cn>::sparse(I, J, prevPts, nextPts, status, err, ptcount, level, block, patch, stream);
+    }
 }
namespace namespace
@@ -76,6 +82,9 @@ namespace
         void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
                     GpuMat& status, GpuMat* err, Stream& stream);

+        void sparse(std::vector<GpuMat>& prevPyr, std::vector<GpuMat>& nextPyr, const GpuMat& prevPts, GpuMat& nextPts,
+                    GpuMat& status, GpuMat* err, Stream& stream);
+
         void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream);

     protected:
@@ -83,8 +92,9 @@ namespace
         int maxLevel_;
         int iters_;
         bool useInitialFlow_;
+        void buildImagePyramid(const GpuMat& prevImg, std::vector<GpuMat>& prevPyr, const GpuMat& nextImg, std::vector<GpuMat>& nextPyr, Stream stream);
     private:
+        friend class SparsePyrLKOpticalFlowImpl;
         std::vector<GpuMat> prevPyr_;
         std::vector<GpuMat> nextPyr_;
     };
@@ -113,26 +123,32 @@ namespace
         block.z = patch.z = 1;
     }

-    void PyrLKOpticalFlowBase::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err, Stream& stream)
+    void PyrLKOpticalFlowBase::buildImagePyramid(const GpuMat& prevImg, std::vector<GpuMat>& prevPyr, const GpuMat& nextImg, std::vector<GpuMat>& nextPyr, Stream stream)
     {
-        if (prevPts.empty())
-        {
-            nextPts.release();
-            status.release();
-            if (err) err->release();
-            return;
-        }
+        prevPyr.resize(maxLevel_ + 1);
+        nextPyr.resize(maxLevel_ + 1);
+
+        int cn = prevImg.channels();
+        CV_Assert(cn == 1 || cn == 3 || cn == 4);
+
+        prevPyr[0] = prevImg;
+        nextPyr[0] = nextImg;
+
+        for (int level = 1; level <= maxLevel_; ++level)
+        {
+            cuda::pyrDown(prevPyr[level - 1], prevPyr[level], stream);
+            cuda::pyrDown(nextPyr[level - 1], nextPyr[level], stream);
+        }
+    }

-        dim3 block, patch;
-        calcPatchSize(winSize_, block, patch);
-
-        CV_Assert( prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4 );
-        CV_Assert( prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type() );
+    void PyrLKOpticalFlowBase::sparse(std::vector<GpuMat>& prevPyr, std::vector<GpuMat>& nextPyr, const GpuMat& prevPts, GpuMat& nextPts,
+                                      GpuMat& status, GpuMat* err, Stream& stream)
+    {
+        CV_Assert(prevPyr.size() && nextPyr.size() && "Pyramid needs to at least contain the original matrix as the first element");
+        CV_Assert(prevPyr[0].size() == nextPyr[0].size());
+        CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);
         CV_Assert(maxLevel_ >= 0);
         CV_Assert(winSize_.width > 2 && winSize_.height > 2);
-        CV_Assert( patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6 );
-        CV_Assert( prevPts.rows == 1 && prevPts.type() == CV_32FC2 );

         if (useInitialFlow_)
             CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == prevPts.type());
         else
@@ -142,66 +158,70 @@ namespace
         GpuMat temp2 = nextPts.reshape(1);
         cuda::multiply(temp1, Scalar::all(1.0 / (1 << maxLevel_) / 2.0), temp2, 1, -1, stream);

         ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
         status.setTo(Scalar::all(1), stream);

         if (err)
             ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);

-        // build the image pyramids.
-        BufferPool pool(stream);
-        prevPyr_.resize(maxLevel_ + 1);
-        nextPyr_.resize(maxLevel_ + 1);
-        int cn = prevImg.channels();
-        if (cn == 1 || cn == 4)
+        if (prevPyr.size() != size_t(maxLevel_ + 1) || nextPyr.size() != size_t(maxLevel_ + 1))
         {
-            prevImg.convertTo(prevPyr_[0], CV_32F, stream);
-            nextImg.convertTo(nextPyr_[0], CV_32F, stream);
-        }
-        else
-        {
-            GpuMat buf = pool.getBuffer(prevImg.size(), CV_MAKE_TYPE(prevImg.depth(), 4));
-            cuda::cvtColor(prevImg, buf, COLOR_BGR2BGRA, 0, stream);
-            buf.convertTo(prevPyr_[0], CV_32F, stream);
-            cuda::cvtColor(nextImg, buf, COLOR_BGR2BGRA, 0, stream);
-            buf.convertTo(nextPyr_[0], CV_32F, stream);
-        }
-        for (int level = 1; level <= maxLevel_; ++level)
-        {
-            cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level], stream);
-            cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level], stream);
+            buildImagePyramid(prevPyr[0], prevPyr, nextPyr[0], nextPyr, stream);
         }

+        dim3 block, patch;
+        calcPatchSize(winSize_, block, patch);
+        CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
+
         pyrlk::loadConstants(make_int2(winSize_.width, winSize_.height), iters_, StreamAccessor::getStream(stream));

+        const int cn = prevPyr[0].channels();
+        const int type = prevPyr[0].depth();
+
+        typedef void(*func_t)(GpuMat I, GpuMat J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                              int level, dim3 block, dim3 patch, cudaStream_t stream);
+
+        // Current int datatype is disabled due to pyrDown not implementing it
+        // while ushort does work, it has significantly worse performance, and thus doesn't pass accuracy tests.
+        static const func_t funcs[6][4] =
+        {
+            { pyrlk::dispatcher<uchar, 1>     , /*pyrlk::dispatcher<uchar, 2>*/  0, pyrlk::dispatcher<uchar, 3>     , pyrlk::dispatcher<uchar, 4>      },
+            { /*pyrlk::dispatcher<char, 1>*/ 0, /*pyrlk::dispatcher<char, 2>*/   0, /*pyrlk::dispatcher<char, 3>*/ 0, /*pyrlk::dispatcher<char, 4>*/ 0 },
+            { pyrlk::dispatcher<ushort, 1>    , /*pyrlk::dispatcher<ushort, 2>*/ 0, pyrlk::dispatcher<ushort, 3>    , pyrlk::dispatcher<ushort, 4>     },
+            { /*pyrlk::dispatcher<short, 1>*/ 0, /*pyrlk::dispatcher<short, 2>*/ 0, /*pyrlk::dispatcher<short, 3>*/ 0, /*pyrlk::dispatcher<short, 4>*/ 0 },
+            { pyrlk::dispatcher<int, 1>       , /*pyrlk::dispatcher<int, 2>*/    0, pyrlk::dispatcher<int, 3>       , pyrlk::dispatcher<int, 4>        },
+            { pyrlk::dispatcher<float, 1>     , /*pyrlk::dispatcher<float, 2>*/  0, pyrlk::dispatcher<float, 3>     , pyrlk::dispatcher<float, 4>      }
+        };
+
+        func_t func = funcs[type][cn - 1];
+        CV_Assert(func != NULL && "Datatype not implemented");
+
         for (int level = maxLevel_; level >= 0; level--)
         {
-            if (cn == 1)
-            {
-                pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
-                    prevPts.ptr<float2>(), nextPts.ptr<float2>(),
-                    status.ptr(),
-                    level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
-                    level, block, patch,
-                    StreamAccessor::getStream(stream));
-            }
-            else
-            {
-                pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
-                    prevPts.ptr<float2>(), nextPts.ptr<float2>(),
-                    status.ptr(),
-                    level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
-                    level, block, patch,
-                    StreamAccessor::getStream(stream));
-            }
+            func(prevPyr[level], nextPyr[level],
+                 prevPts.ptr<float2>(), nextPts.ptr<float2>(),
+                 status.ptr(), level == 0 && err ? err->ptr<float>() : 0,
+                 prevPts.cols, level, block, patch,
+                 StreamAccessor::getStream(stream));
         }
     }

+    void PyrLKOpticalFlowBase::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err, Stream& stream)
+    {
+        if (prevPts.empty())
+        {
+            nextPts.release();
+            status.release();
+            if (err) err->release();
+            return;
+        }
+
+        CV_Assert( prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4 );
+        CV_Assert( prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type() );
+
+        // build the image pyramids.
+        buildImagePyramid(prevImg, prevPyr_, nextImg, nextPyr_, stream);
+        sparse(prevPyr_, nextPyr_, prevPts, nextPts, status, err, stream);
+    }

     void PyrLKOpticalFlowBase::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream)
@@ -250,7 +270,7 @@ namespace
         {
             int idx2 = (idx + 1) & 1;

-            pyrlk::dense(prevPyr_[level], nextPyr_[level],
+            pyrlk::pyrLK_caller<float, 1>::dense(prevPyr_[level], nextPyr_[level],
                          uPyr[idx], vPyr[idx], uPyr[idx2], vPyr[idx2],
                          PtrStepSzf(), winSize2i,
                          StreamAccessor::getStream(stream));
@@ -289,15 +309,24 @@ namespace
                     OutputArray _err,
                     Stream& stream)
         {
-            const GpuMat prevImg = _prevImg.getGpuMat();
-            const GpuMat nextImg = _nextImg.getGpuMat();
             const GpuMat prevPts = _prevPts.getGpuMat();
             GpuMat& nextPts = _nextPts.getGpuMatRef();
             GpuMat& status = _status.getGpuMatRef();
             GpuMat* err = _err.needed() ? &(_err.getGpuMatRef()) : NULL;

+            if (_prevImg.kind() == _InputArray::STD_VECTOR_CUDA_GPU_MAT && _nextImg.kind() == _InputArray::STD_VECTOR_CUDA_GPU_MAT)
+            {
+                std::vector<GpuMat> prevPyr, nextPyr;
+                _prevImg.getGpuMatVector(prevPyr);
+                _nextImg.getGpuMatVector(nextPyr);
+                sparse(prevPyr, nextPyr, prevPts, nextPts, status, err, stream);
+            }
+            else
+            {
+                const GpuMat prevImg = _prevImg.getGpuMat();
+                const GpuMat nextImg = _nextImg.getGpuMat();
                 sparse(prevImg, nextImg, prevPts, nextPts, status, err, stream);
+            }
         }
     };

     class DensePyrLKOpticalFlowImpl : public DensePyrLKOpticalFlow, private PyrLKOpticalFlowBase
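A note on the funcs[6][4] table above: the indexing funcs[type][cn - 1] works because OpenCV's depth constants are sequential small integers, so Mat::depth() selects the row directly and channels() - 1 the column. A minimal standalone check of that assumption (illustrative, not part of the commit):

#include <opencv2/core.hpp>
#include <cassert>

int main()
{
    // CV_8U..CV_32F are 0..5, matching the six rows of the dispatch table.
    assert(CV_8U  == 0 && CV_8S  == 1 && CV_16U == 2);
    assert(CV_16S == 3 && CV_32S == 4 && CV_32F == 5);

    cv::Mat img(480, 640, CV_16UC3);
    assert(img.depth() == 2);        // row index: the ushort dispatchers
    assert(img.channels() - 1 == 2); // column index: the 3-channel dispatchers
    return 0;
}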


@@ -167,33 +167,34 @@ INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, BroxOpticalFlow, ALL_DEVICES);
 namespace
 {
-    IMPLEMENT_PARAM_CLASS(UseGray, bool)
+    IMPLEMENT_PARAM_CLASS(Chan, int)
+    IMPLEMENT_PARAM_CLASS(DataType, int)
 }

-PARAM_TEST_CASE(PyrLKOpticalFlow, cv::cuda::DeviceInfo, UseGray)
+PARAM_TEST_CASE(PyrLKOpticalFlow, cv::cuda::DeviceInfo, Chan, DataType)
 {
     cv::cuda::DeviceInfo devInfo;
-    bool useGray;
+    int channels;
+    int dataType;
     virtual void SetUp()
     {
         devInfo = GET_PARAM(0);
-        useGray = GET_PARAM(1);
+        channels = GET_PARAM(1);
+        dataType = GET_PARAM(2);
         cv::cuda::setDevice(devInfo.deviceID());
     }
 };

 CUDA_TEST_P(PyrLKOpticalFlow, Sparse)
 {
-    cv::Mat frame0 = readImage("opticalflow/frame0.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
+    cv::Mat frame0 = readImage("opticalflow/frame0.png", channels == 1 ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
     ASSERT_FALSE(frame0.empty());

-    cv::Mat frame1 = readImage("opticalflow/frame1.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
+    cv::Mat frame1 = readImage("opticalflow/frame1.png", channels == 1 ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
     ASSERT_FALSE(frame1.empty());

     cv::Mat gray_frame;
-    if (useGray)
+    if (channels == 1)
         gray_frame = frame0;
     else
         cv::cvtColor(frame0, gray_frame, cv::COLOR_BGR2GRAY);
@@ -208,9 +209,23 @@ CUDA_TEST_P(PyrLKOpticalFlow, Sparse)
     cv::Ptr<cv::cuda::SparsePyrLKOpticalFlow> pyrLK =
             cv::cuda::SparsePyrLKOpticalFlow::create();

+    std::vector<cv::Point2f> nextPts_gold;
+    std::vector<unsigned char> status_gold;
+    cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts_gold, status_gold, cv::noArray());
+
     cv::cuda::GpuMat d_nextPts;
     cv::cuda::GpuMat d_status;
-    pyrLK->calc(loadMat(frame0), loadMat(frame1), d_pts, d_nextPts, d_status);
+    cv::Mat converted0, converted1;
+    if (channels == 4)
+    {
+        cv::cvtColor(frame0, frame0, cv::COLOR_BGR2BGRA);
+        cv::cvtColor(frame1, frame1, cv::COLOR_BGR2BGRA);
+    }
+    frame0.convertTo(converted0, dataType);
+    frame1.convertTo(converted1, dataType);
+
+    pyrLK->calc(loadMat(converted0), loadMat(converted1), d_pts, d_nextPts, d_status);

     std::vector<cv::Point2f> nextPts(d_nextPts.cols);
     cv::Mat nextPts_mat(1, d_nextPts.cols, CV_32FC2, (void*)&nextPts[0]);
@@ -220,10 +235,6 @@ CUDA_TEST_P(PyrLKOpticalFlow, Sparse)
     cv::Mat status_mat(1, d_status.cols, CV_8UC1, (void*)&status[0]);
     d_status.download(status_mat);

-    std::vector<cv::Point2f> nextPts_gold;
-    std::vector<unsigned char> status_gold;
-    cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts_gold, status_gold, cv::noArray());
-
     ASSERT_EQ(nextPts_gold.size(), nextPts.size());
     ASSERT_EQ(status_gold.size(), status.size());
@@ -251,11 +262,16 @@ CUDA_TEST_P(PyrLKOpticalFlow, Sparse)
     double bad_ratio = static_cast<double>(mistmatch) / nextPts.size();

     ASSERT_LE(bad_ratio, 0.01);
 }

 INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, PyrLKOpticalFlow, testing::Combine(
     ALL_DEVICES,
-    testing::Values(UseGray(true), UseGray(false))));
+    testing::Values(Chan(1), Chan(3), Chan(4)),
+    testing::Values(DataType(CV_8U), DataType(CV_16U), DataType(CV_32S), DataType(CV_32F))));

 //////////////////////////////////////////////////////
 // FarnebackOpticalFlow

@@ -212,10 +212,10 @@ namespace cv { namespace cuda { namespace device
     template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

-    //template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     //template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-    //template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-    //template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

     template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     //template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

@@ -74,7 +74,7 @@ void cv::cuda::pyrDown(InputArray _src, OutputArray _dst, Stream& stream)
         {0 /*pyrDown_gpu<schar>*/, 0 /*pyrDown_gpu<schar2>*/ , 0 /*pyrDown_gpu<schar3>*/, 0 /*pyrDown_gpu<schar4>*/},
         {pyrDown_gpu<ushort>     , 0 /*pyrDown_gpu<ushort2>*/, pyrDown_gpu<ushort3>     , pyrDown_gpu<ushort4>     },
         {pyrDown_gpu<short>      , 0 /*pyrDown_gpu<short2>*/ , pyrDown_gpu<short3>      , pyrDown_gpu<short4>      },
-        {0 /*pyrDown_gpu<int>*/  , 0 /*pyrDown_gpu<int2>*/   , 0 /*pyrDown_gpu<int3>*/  , 0 /*pyrDown_gpu<int4>*/  },
+        {pyrDown_gpu<int>        , 0 /*pyrDown_gpu<int2>*/   , pyrDown_gpu<int3>        , pyrDown_gpu<int4>        },
         {pyrDown_gpu<float>      , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3>      , pyrDown_gpu<float4>      }
     };
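With the int instantiations enabled above, cuda::pyrDown also accepts CV_32S input directly, which is what the new pyramid-building path relies on. A quick sketch (illustrative, not part of the commit):

#include <opencv2/cudawarping.hpp>

int main()
{
    cv::cuda::GpuMat src(480, 640, CV_32SC3, cv::Scalar::all(7));
    cv::cuda::GpuMat dst;
    cv::cuda::pyrDown(src, dst); // now dispatches to pyrDown_gpu<int3>
    return 0;
}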