added wrappers for BroxOpticalFlow and interpolateFrames

This commit is contained in:
Vladislav Vinogradov
2011-10-17 13:12:39 +00:00
parent 87f3451ec6
commit b0536279eb
6 changed files with 986 additions and 216 deletions

View File

@@ -65,76 +65,6 @@
#include "opencv2/gpu/device/utility.hpp"
////////////////////////////////////////////
template<typename _Tp> class shared_ptr
{
public:
shared_ptr() : obj(0), refcount(0) {}
shared_ptr(_Tp* _obj);
~shared_ptr() { release(); }
shared_ptr(const shared_ptr& ptr);
shared_ptr& operator = (const shared_ptr& ptr);
void addref() { if( refcount ) (*refcount)+=1; }
void release();
void delete_obj() { if( obj ) delete obj; }
_Tp* operator -> () { return obj; }
const _Tp* operator -> () const { return obj; }
operator _Tp* () { return obj; }
operator const _Tp*() const { return obj; }
protected:
_Tp* obj; //< the object pointer.
int* refcount; //< the associated reference counter
};
template<typename _Tp> inline shared_ptr<_Tp>::shared_ptr(_Tp* _obj) : obj(_obj)
{
if(obj)
{
refcount = new int;
*refcount = 1;
}
else
refcount = 0;
}
template<typename _Tp> inline void shared_ptr<_Tp>::release()
{
if( refcount)
{
*refcount -= 1;
if (*refcount == 0)
{
delete_obj();
delete refcount;
}
}
refcount = 0;
obj = 0;
}
template<typename _Tp> inline shared_ptr<_Tp>::shared_ptr(const shared_ptr<_Tp>& ptr)
{
obj = ptr.obj;
refcount = ptr.refcount;
addref();
}
template<typename _Tp> inline shared_ptr<_Tp>& shared_ptr<_Tp>::operator = (const shared_ptr<_Tp>& ptr)
{
int* _refcount = ptr.refcount;
if( _refcount )
*_refcount += 1;
release();
obj = ptr.obj;
refcount = _refcount;
return *this;
}
////////////////////////////////////////////
//using std::tr1::shared_ptr;
typedef NCVVectorAlloc<Ncv32f> FloatVector;
/////////////////////////////////////////////////////////////////////////////////////////
@@ -738,6 +668,42 @@ void InitTextures()
initTexture1D(tex_numerator_v);
}
namespace
{
struct ImagePyramid
{
std::vector<FloatVector*> img0;
std::vector<FloatVector*> img1;
std::vector<Ncv32u> w;
std::vector<Ncv32u> h;
explicit ImagePyramid(int outer_iterations)
{
img0.reserve(outer_iterations);
img1.reserve(outer_iterations);
w.reserve(outer_iterations);
h.reserve(outer_iterations);
}
~ImagePyramid()
{
w.clear();
h.clear();
for (int i = img0.size() - 1; i >= 0; --i)
{
delete img1[i];
delete img0[i];
}
img0.clear();
img1.clear();
}
};
}
/////////////////////////////////////////////////////////////////////////////////////////
// MAIN FUNCTION
/////////////////////////////////////////////////////////////////////////////////////////
@@ -759,21 +725,19 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
const Ncv32u kSourceHeight = frame0.height();
ncvAssertPrintReturn(frame1.width() == kSourceWidth && frame1.height() == kSourceHeight, "Frame dims do not match", NCV_INCONSISTENT_INPUT);
ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth &&
ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth &&
uOut.height() == kSourceHeight && vOut.height() == kSourceHeight, NCV_INCONSISTENT_INPUT);
ncvAssertReturn(gpu_mem_allocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);
bool kSkipProcessing = gpu_mem_allocator.isCounting();
cudaDeviceProp device_props;
int cuda_device;
ncvAssertCUDAReturn(cudaGetDevice(&cuda_device), NCV_CUDA_ERROR);
cudaDeviceProp device_props;
ncvAssertCUDAReturn(cudaGetDeviceProperties(&device_props, cuda_device), NCV_CUDA_ERROR);
Ncv32u alignmentValue = gpu_mem_allocator.alignment ();
const Ncv32u kStrideAlignmentFloat = alignmentValue / sizeof(float);
@@ -817,8 +781,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
// temporary storage
SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator,
alignUp(kSourceWidth, kStrideAlignmentFloat)
* alignUp(kSourceHeight, kStrideAlignmentFloat));
alignUp(kSourceWidth, kStrideAlignmentFloat) * alignUp(kSourceHeight, kStrideAlignmentFloat));
// image derivatives
SAFE_VECTOR_DECL(Ix, gpu_mem_allocator, kSizeInPixelsAligned);
@@ -831,35 +794,31 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
// spatial derivative filter size
const int kDFilterSize = 5;
const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f};
SAFE_VECTOR_DECL(derivativeFilter, gpu_mem_allocator, kDFilterSize);
ncvAssertCUDAReturn(
cudaMemcpy(derivativeFilter.ptr(),
derivativeFilterHost,
sizeof(float) * kDFilterSize,
cudaMemcpyHostToDevice),
NCV_CUDA_ERROR);
if (!kSkipProcessing)
{
const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f};
InitTextures();
ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize,
cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
InitTextures();
}
//prepare image pyramid
std::vector< shared_ptr<FloatVector> > img0_pyramid;
std::vector< shared_ptr<FloatVector> > img1_pyramid;
std::vector<Ncv32u> w_pyramid;
std::vector<Ncv32u> h_pyramid;
ImagePyramid pyr(desc.number_of_outer_iterations);
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
float scale = 1.0f;
//cuda arrays for frames
shared_ptr<FloatVector> I0(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
ncvAssertReturn(I0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
std::auto_ptr<FloatVector> pI0(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
ncvAssertReturn(pI0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
shared_ptr<FloatVector> I1(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
ncvAssertReturn(I1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
std::auto_ptr<FloatVector> pI1(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
ncvAssertReturn(pI1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
if (!kSkipProcessing)
{
@@ -867,25 +826,29 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
size_t dst_width_in_bytes = alignUp(kSourceWidth, kStrideAlignmentFloat) * sizeof(float);
size_t src_width_in_bytes = kSourceWidth * sizeof(float);
size_t src_pitch_in_bytes = frame0.pitch();
ncvAssertCUDAReturn( cudaMemcpy2DAsync(I0->ptr(), dst_width_in_bytes, frame0.ptr(),
ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(),
src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
ncvAssertCUDAReturn( cudaMemcpy2DAsync(I1->ptr(), dst_width_in_bytes, frame1.ptr(),
ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(),
src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
}
//prepare pyramid
img0_pyramid.push_back(I0);
img1_pyramid.push_back(I1);
FloatVector* I0 = pI0.release();
FloatVector* I1 = pI1.release();
w_pyramid.push_back(kSourceWidth);
h_pyramid.push_back(kSourceHeight);
//prepare pyramid
pyr.img0.push_back(I0);
pyr.img1.push_back(I1);
pyr.w.push_back(kSourceWidth);
pyr.h.push_back(kSourceHeight);
scale *= scale_factor;
Ncv32u prev_level_width = kSourceWidth;
Ncv32u prev_level_height = kSourceHeight;
while((prev_level_width > 15) && (prev_level_height > 15) && (static_cast<Ncv32u>(img0_pyramid.size()) < desc.number_of_outer_iterations))
while((prev_level_width > 15) && (prev_level_height > 15) && (static_cast<Ncv32u>(pyr.img0.size()) < desc.number_of_outer_iterations))
{
//current resolution
Ncv32u level_width = static_cast<Ncv32u>(ceilf(kSourceWidth * scale));
@@ -897,16 +860,16 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
Ncv32u prev_level_pitch = alignUp(prev_level_width, kStrideAlignmentFloat) * sizeof(float);
shared_ptr<FloatVector> level_frame0(new FloatVector(gpu_mem_allocator, buffer_size));
std::auto_ptr<FloatVector> level_frame0(new FloatVector(gpu_mem_allocator, buffer_size));
ncvAssertReturn(level_frame0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
shared_ptr<FloatVector> level_frame1(new FloatVector(gpu_mem_allocator, buffer_size));
std::auto_ptr<FloatVector> level_frame1(new FloatVector(gpu_mem_allocator, buffer_size));
ncvAssertReturn(level_frame1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
if (!kSkipProcessing)
{
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
NcvSize32u srcSize (prev_level_width, prev_level_height);
NcvSize32u dstSize (level_width, level_height);
NcvRect32u srcROI (0, 0, prev_level_width, prev_level_height);
@@ -921,20 +884,20 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample);
}
//store pointers
img0_pyramid.push_back(level_frame0);
img1_pyramid.push_back(level_frame1);
I0 = level_frame0.release();
I1 = level_frame1.release();
w_pyramid.push_back(level_width);
h_pyramid.push_back(level_height);
//store pointers
pyr.img0.push_back(I0);
pyr.img1.push_back(I1);
pyr.w.push_back(level_width);
pyr.h.push_back(level_height);
scale *= scale_factor;
prev_level_width = level_width;
prev_level_height = level_height;
I0 = level_frame0;
I1 = level_frame1;
}
if (!kSkipProcessing)
@@ -944,62 +907,56 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);
//select images with lowest resolution
size_t pitch = alignUp(w_pyramid.back(), kStrideAlignmentFloat) * sizeof(float);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, img0_pyramid.back()->ptr(), channel_desc, w_pyramid.back(), h_pyramid.back(), pitch), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, img1_pyramid.back()->ptr(), channel_desc, w_pyramid.back(), h_pyramid.back(), pitch), NCV_CUDA_ERROR);
size_t pitch = alignUp(pyr.w.back(), kStrideAlignmentFloat) * sizeof(float);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, pyr.img0.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, pyr.img1.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
}
FloatVector* ptrU = &u;
FloatVector* ptrV = &v;
FloatVector* ptrUNew = &u_new;
FloatVector* ptrVNew = &v_new;
FloatVector* ptrU = &u;
FloatVector* ptrV = &v;
FloatVector* ptrUNew = &u_new;
FloatVector* ptrVNew = &v_new;
std::vector< shared_ptr<FloatVector> >::const_reverse_iterator img0Iter = img0_pyramid.rbegin();
std::vector< shared_ptr<FloatVector> >::const_reverse_iterator img1Iter = img1_pyramid.rbegin();
//outer loop
//warping fixed point iteration
while(!w_pyramid.empty())
{
//current grid dimensions
const Ncv32u kLevelWidth = w_pyramid.back();
const Ncv32u kLevelHeight = h_pyramid.back();
const Ncv32u kLevelStride = alignUp(kLevelWidth, kStrideAlignmentFloat);
std::vector<FloatVector*>::const_reverse_iterator img0Iter = pyr.img0.rbegin();
std::vector<FloatVector*>::const_reverse_iterator img1Iter = pyr.img1.rbegin();
//size of current image in bytes
const int kLevelSizeInBytes = kLevelStride * kLevelHeight * sizeof(float);
//number of points at current resolution
const int kLevelSizeInPixels = kLevelStride * kLevelHeight;
if (!kSkipProcessing)
//outer loop
//warping fixed point iteration
while(!pyr.w.empty())
{
//current grid dimensions
const Ncv32u kLevelWidth = pyr.w.back();
const Ncv32u kLevelHeight = pyr.h.back();
const Ncv32u kLevelStride = alignUp(kLevelWidth, kStrideAlignmentFloat);
//size of current image in bytes
const int kLevelSizeInBytes = kLevelStride * kLevelHeight * sizeof(float);
//number of points at current resolution
const int kLevelSizeInPixels = kLevelStride * kLevelHeight;
//initial guess for du and dv
ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
}
//texture format descriptor
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
//texture format descriptor
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
I0 = *img0Iter;
I1 = *img1Iter;
I0 = *img0Iter;
I1 = *img1Iter;
++img0Iter;
++img1Iter;
++img0Iter;
++img1Iter;
if (!kSkipProcessing)
{
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), channel_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), channel_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
}
//compute derivatives
dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
dim3 dThreads(32, 6);
const int kPitchTex = kLevelStride * sizeof(float);
if (!kSkipProcessing)
{
//compute derivatives
dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
dim3 dThreads(32, 6);
const int kPitchTex = kLevelStride * sizeof(float);
NcvSize32u srcSize(kLevelWidth, kLevelHeight);
Ncv32u nSrcStep = kLevelStride * sizeof(float);
NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight);
@@ -1031,10 +988,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
// Ixy
nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
}
if (!kSkipProcessing)
{
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
@@ -1049,23 +1003,19 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
// flow increments
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
}
dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);
dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);
dim3 sor_blocks(iDivUp(kLevelWidth, SOR_TILE_WIDTH), iDivUp(kLevelHeight, SOR_TILE_HEIGHT));
dim3 sor_threads(SOR_TILE_WIDTH, SOR_TILE_HEIGHT);
dim3 sor_blocks(iDivUp(kLevelWidth, SOR_TILE_WIDTH), iDivUp(kLevelHeight, SOR_TILE_HEIGHT));
dim3 sor_threads(SOR_TILE_WIDTH, SOR_TILE_HEIGHT);
// inner loop
// lagged nonlinearity fixed point iteration
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
{
//compute coefficients
if (!kSkipProcessing)
// inner loop
// lagged nonlinearity fixed point iteration
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
{
//compute coefficients
prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>>
(diffusivity_x.ptr(),
diffusivity_y.ptr(),
@@ -1101,13 +1051,12 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
}
//solve linear system
for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
{
float omega = 1.99f;
if (!kSkipProcessing)
//solve linear system
for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
{
float omega = 1.99f;
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
@@ -1139,33 +1088,29 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
kLevelWidth,
kLevelHeight,
kLevelStride);
}
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
}//end of solver loop
}// end of inner loop
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
}//end of solver loop
}// end of inner loop
//update u and v
if (!kSkipProcessing)
{
//update u and v
add(ptrU->ptr(), du.ptr(), kLevelSizeInPixels, stream);
add(ptrV->ptr(), dv.ptr(), kLevelSizeInPixels, stream);
}
//prolongate using texture
w_pyramid.pop_back();
h_pyramid.pop_back();
if (!w_pyramid.empty())
{
//compute new image size
Ncv32u nw = w_pyramid.back();
Ncv32u nh = h_pyramid.back();
Ncv32u ns = alignUp(nw, kStrideAlignmentFloat);
dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
dim3 p_threads(32, 8);
if (!kSkipProcessing)
//prolongate using texture
pyr.w.pop_back();
pyr.h.pop_back();
if (!pyr.w.empty())
{
//compute new image size
Ncv32u nw = pyr.w.back();
Ncv32u nh = pyr.h.back();
Ncv32u ns = alignUp(nw, kStrideAlignmentFloat);
dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
dim3 p_threads(32, 8);
NcvSize32u srcSize (kLevelWidth, kLevelHeight);
NcvSize32u dstSize (nw, nh);
NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);
@@ -1180,27 +1125,27 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic);
ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
cv::gpu::device::swap<FloatVector*>(ptrU, ptrUNew);
cv::gpu::device::swap<FloatVector*>(ptrV, ptrVNew);
}
cv::gpu::device::swap<FloatVector*>(ptrU, ptrUNew);
cv::gpu::device::swap<FloatVector*>(ptrV, ptrVNew);
scale /= scale_factor;
}
scale /= scale_factor;
// end of warping iterations
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn( cudaMemcpy2DAsync
(uOut.ptr(), uOut.pitch(), ptrU->ptr(),
kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
ncvAssertCUDAReturn( cudaMemcpy2DAsync
(vOut.ptr(), vOut.pitch(), ptrV->ptr(),
kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
}
// end of warping iterations
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn( cudaMemcpy2DAsync
(uOut.ptr(), uOut.pitch(), ptrU->ptr(),
kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
ncvAssertCUDAReturn( cudaMemcpy2DAsync
(vOut.ptr(), vOut.pitch(), ptrV->ptr(),
kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
return NCV_SUCCESS;
}

View File

@@ -0,0 +1,198 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other GpuMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or bpied warranties, including, but not limited to, the bpied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
using namespace std;
#if !defined (HAVE_CUDA)
void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::interpolateFrames(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, float, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
#else
namespace
{
size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc, const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)
{
NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
CV_Assert(gpuCounter.isInitialized());
NCVStatus ncvStat = NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0);
CV_Assert(ncvStat == NCV_SUCCESS);
return gpuCounter.maxSize();
}
}
namespace
{
void outputHandler(const char* msg)
{
CV_Error(CV_GpuApiCallError, msg);
}
}
void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& s)
{
ncvSetDebugOutputHandler(outputHandler);
CV_Assert(frame0.type() == CV_32FC1);
CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
u.create(frame0.size(), CV_32FC1);
v.create(frame0.size(), CV_32FC1);
cudaDeviceProp devProp;
cudaSafeCall( cudaGetDeviceProperties(&devProp, getDevice()) );
NCVBroxOpticalFlowDescriptor desc;
desc.alpha = alpha;
desc.gamma = gamma;
desc.scale_factor = scale_factor;
desc.number_of_inner_iterations = inner_iterations;
desc.number_of_outer_iterations = outer_iterations;
desc.number_of_solver_iterations = solver_iterations;
NCVMemSegment frame0MemSeg;
frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
frame0MemSeg.size = frame0.step * frame0.rows;
NCVMemSegment frame1MemSeg;
frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
frame1MemSeg.size = frame1.step * frame1.rows;
NCVMemSegment uMemSeg;
uMemSeg.begin.memtype = NCVMemoryTypeDevice;
uMemSeg.begin.ptr = u.ptr();
uMemSeg.size = u.step * u.rows;
NCVMemSegment vMemSeg;
vMemSeg.begin.memtype = NCVMemoryTypeDevice;
vMemSeg.begin.ptr = v.ptr();
vMemSeg.size = v.step * v.rows;
NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, devProp.textureAlignment, frame0.cols, frame0.rows, frame0.step);
NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, devProp.textureAlignment, frame1.cols, frame1.rows, frame1.step);
NCVMatrixReuse<Ncv32f> uMat(uMemSeg, devProp.textureAlignment, u.cols, u.rows, u.step);
NCVMatrixReuse<Ncv32f> vMat(vMemSeg, devProp.textureAlignment, v.cols, v.rows, v.step);
cudaStream_t stream = StreamAccessor::getStream(s);
size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, devProp);
ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());
CV_Assert(gpuAllocator.isInitialized());
NCVStatus ncvStat = NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream);
CV_Assert(ncvStat == NCV_SUCCESS);
}
void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, const GpuMat& fu, const GpuMat& fv, const GpuMat& bu, const GpuMat& bv,
float pos, GpuMat& newFrame, GpuMat& buf, Stream& s)
{
CV_Assert(frame0.type() == CV_32FC1);
CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
CV_Assert(fu.size() == frame0.size() && fu.type() == frame0.type());
CV_Assert(fv.size() == frame0.size() && fv.type() == frame0.type());
CV_Assert(bu.size() == frame0.size() && bu.type() == frame0.type());
CV_Assert(bv.size() == frame0.size() && bv.type() == frame0.type());
newFrame.create(frame0.size(), frame0.type());
buf.create(6 * frame0.rows, frame0.cols, CV_32FC1);
buf.setTo(Scalar::all(0));
// occlusion masks
GpuMat occ0 = buf.rowRange(0 * frame0.rows, 1 * frame0.rows);
GpuMat occ1 = buf.rowRange(1 * frame0.rows, 2 * frame0.rows);
// interpolated forward flow
GpuMat fui = buf.rowRange(2 * frame0.rows, 3 * frame0.rows);
GpuMat fvi = buf.rowRange(3 * frame0.rows, 4 * frame0.rows);
// interpolated backward flow
GpuMat bui = buf.rowRange(4 * frame0.rows, 5 * frame0.rows);
GpuMat bvi = buf.rowRange(5 * frame0.rows, 6 * frame0.rows);
size_t step = frame0.step;
CV_Assert(frame1.step == step && fu.step == step && fv.step == step && bu.step == step && bv.step == step && newFrame.step == step && buf.step == step);
cudaStream_t stream = StreamAccessor::getStream(s);
NppStStreamHandler h(stream);
NppStInterpolationState state;
state.size = NcvSize32u(frame0.cols, frame0.rows);
state.nStep = static_cast<Ncv32u>(step);
state.pSrcFrame0 = const_cast<Ncv32f*>(frame0.ptr<Ncv32f>());
state.pSrcFrame1 = const_cast<Ncv32f*>(frame1.ptr<Ncv32f>());
state.pFU = const_cast<Ncv32f*>(fu.ptr<Ncv32f>());
state.pFV = const_cast<Ncv32f*>(fv.ptr<Ncv32f>());
state.pBU = const_cast<Ncv32f*>(bu.ptr<Ncv32f>());
state.pBV = const_cast<Ncv32f*>(bv.ptr<Ncv32f>());
state.pos = pos;
state.pNewFrame = newFrame.ptr<Ncv32f>();
state.ppBuffers[0] = occ0.ptr<Ncv32f>();
state.ppBuffers[1] = occ1.ptr<Ncv32f>();
state.ppBuffers[2] = fui.ptr<Ncv32f>();
state.ppBuffers[3] = fvi.ptr<Ncv32f>();
state.ppBuffers[4] = bui.ptr<Ncv32f>();
state.ppBuffers[5] = bvi.ptr<Ncv32f>();
nppSafeCall( nppiStInterpolateFrames(&state) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
#endif /* HAVE_CUDA */

View File

@@ -76,6 +76,7 @@
#include "nvidia/core/NCV.hpp"
#include "nvidia/NPP_staging/NPP_staging.hpp"
#include "nvidia/NCVHaarObjectDetection.hpp"
#include "nvidia/NCVBroxOpticalFlow.hpp"
#define CUDART_MINIMUM_REQUIRED_VERSION 4000
#define NPP_MINIMUM_REQUIRED_VERSION 4000