Fixed bug #1759
This commit is contained in:
@@ -1750,6 +1750,7 @@ public:
|
|||||||
useInitialFlow = false;
|
useInitialFlow = false;
|
||||||
minEigThreshold = 1e-4f;
|
minEigThreshold = 1e-4f;
|
||||||
getMinEigenVals = false;
|
getMinEigenVals = false;
|
||||||
|
isDeviceArch11_ = !DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
|
||||||
}
|
}
|
||||||
|
|
||||||
void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
|
void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
|
||||||
@@ -1796,6 +1797,8 @@ private:
|
|||||||
|
|
||||||
vector<GpuMat> uPyr_;
|
vector<GpuMat> uPyr_;
|
||||||
vector<GpuMat> vPyr_;
|
vector<GpuMat> vPyr_;
|
||||||
|
|
||||||
|
bool isDeviceArch11_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@@ -1812,6 +1815,7 @@ public:
|
|||||||
polyN = 5;
|
polyN = 5;
|
||||||
polySigma = 1.1;
|
polySigma = 1.1;
|
||||||
flags = 0;
|
flags = 0;
|
||||||
|
isDeviceArch11_ = !DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
|
||||||
}
|
}
|
||||||
|
|
||||||
int numLevels;
|
int numLevels;
|
||||||
@@ -1859,6 +1863,8 @@ private:
|
|||||||
GpuMat frames_[2];
|
GpuMat frames_[2];
|
||||||
GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
|
GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
|
||||||
std::vector<GpuMat> pyramid0_, pyramid1_;
|
std::vector<GpuMat> pyramid0_, pyramid1_;
|
||||||
|
|
||||||
|
bool isDeviceArch11_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@@ -433,6 +433,25 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void boxFilter5Gpu_CC11(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
int height = src.rows / 5;
|
||||||
|
int width = src.cols;
|
||||||
|
|
||||||
|
dim3 block(128);
|
||||||
|
dim3 grid(divUp(width, block.x), divUp(height, block.y));
|
||||||
|
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
|
||||||
|
|
||||||
|
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
|
||||||
|
boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
|
||||||
|
|
||||||
|
cudaSafeCall(cudaGetLastError());
|
||||||
|
|
||||||
|
if (stream == 0)
|
||||||
|
cudaSafeCall(cudaDeviceSynchronize());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
__constant__ float c_gKer[MAX_KSIZE_HALF + 1];
|
__constant__ float c_gKer[MAX_KSIZE_HALF + 1];
|
||||||
|
|
||||||
template <typename Border>
|
template <typename Border>
|
||||||
@@ -575,14 +594,14 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <typename Border>
|
template <typename Border, int blockDimX>
|
||||||
void gaussianBlur5Caller(
|
void gaussianBlur5Caller(
|
||||||
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
|
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
|
||||||
{
|
{
|
||||||
int height = src.rows / 5;
|
int height = src.rows / 5;
|
||||||
int width = src.cols;
|
int width = src.cols;
|
||||||
|
|
||||||
dim3 block(256);
|
dim3 block(blockDimX);
|
||||||
dim3 grid(divUp(width, block.x), divUp(height, block.y));
|
dim3 grid(divUp(width, block.x), divUp(height, block.y));
|
||||||
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
|
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
|
||||||
Border b(height, width);
|
Border b(height, width);
|
||||||
@@ -603,8 +622,22 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
|
|||||||
|
|
||||||
static const caller_t callers[] =
|
static const caller_t callers[] =
|
||||||
{
|
{
|
||||||
gaussianBlur5Caller<BrdReflect101<float> >,
|
gaussianBlur5Caller<BrdReflect101<float>,256>,
|
||||||
gaussianBlur5Caller<BrdReplicate<float> >,
|
gaussianBlur5Caller<BrdReplicate<float>,256>,
|
||||||
|
};
|
||||||
|
|
||||||
|
callers[borderMode](src, ksizeHalf, dst, stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
void gaussianBlur5Gpu_CC11(
|
||||||
|
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderMode, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
typedef void (*caller_t)(const DevMem2Df, int, DevMem2Df, cudaStream_t);
|
||||||
|
|
||||||
|
static const caller_t callers[] =
|
||||||
|
{
|
||||||
|
gaussianBlur5Caller<BrdReflect101<float>,128>,
|
||||||
|
gaussianBlur5Caller<BrdReplicate<float>,128>,
|
||||||
};
|
};
|
||||||
|
|
||||||
callers[borderMode](src, ksizeHalf, dst, stream);
|
callers[borderMode](src, ksizeHalf, dst, stream);
|
||||||
|
@@ -181,6 +181,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
smem3[tid] = val3;
|
smem3[tid] = val3;
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
|
#if __CUDA_ARCH__ > 110
|
||||||
if (tid < 128)
|
if (tid < 128)
|
||||||
{
|
{
|
||||||
smem1[tid] = val1 += smem1[tid + 128];
|
smem1[tid] = val1 += smem1[tid + 128];
|
||||||
@@ -188,6 +189,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
smem3[tid] = val3 += smem3[tid + 128];
|
smem3[tid] = val3 += smem3[tid + 128];
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
#endif
|
||||||
|
|
||||||
if (tid < 64)
|
if (tid < 64)
|
||||||
{
|
{
|
||||||
@@ -235,12 +237,14 @@ namespace cv { namespace gpu { namespace device
|
|||||||
smem2[tid] = val2;
|
smem2[tid] = val2;
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
|
#if __CUDA_ARCH__ > 110
|
||||||
if (tid < 128)
|
if (tid < 128)
|
||||||
{
|
{
|
||||||
smem1[tid] = val1 += smem1[tid + 128];
|
smem1[tid] = val1 += smem1[tid + 128];
|
||||||
smem2[tid] = val2 += smem2[tid + 128];
|
smem2[tid] = val2 += smem2[tid + 128];
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
#endif
|
||||||
|
|
||||||
if (tid < 64)
|
if (tid < 64)
|
||||||
{
|
{
|
||||||
@@ -279,11 +283,13 @@ namespace cv { namespace gpu { namespace device
|
|||||||
smem1[tid] = val1;
|
smem1[tid] = val1;
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
|
#if __CUDA_ARCH__ > 110
|
||||||
if (tid < 128)
|
if (tid < 128)
|
||||||
{
|
{
|
||||||
smem1[tid] = val1 += smem1[tid + 128];
|
smem1[tid] = val1 += smem1[tid + 128];
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
#endif
|
||||||
|
|
||||||
if (tid < 64)
|
if (tid < 64)
|
||||||
{
|
{
|
||||||
@@ -310,9 +316,15 @@ namespace cv { namespace gpu { namespace device
|
|||||||
__global__ void lkSparse(const PtrStepb I, const PtrStepb J, const PtrStep<short> dIdx, const PtrStep<short> dIdy,
|
__global__ void lkSparse(const PtrStepb I, const PtrStepb J, const PtrStep<short> dIdx, const PtrStep<short> dIdy,
|
||||||
const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
|
const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
|
||||||
{
|
{
|
||||||
|
#if __CUDA_ARCH__ <= 110
|
||||||
|
__shared__ float smem1[128];
|
||||||
|
__shared__ float smem2[128];
|
||||||
|
__shared__ float smem3[128];
|
||||||
|
#else
|
||||||
__shared__ float smem1[256];
|
__shared__ float smem1[256];
|
||||||
__shared__ float smem2[256];
|
__shared__ float smem2[256];
|
||||||
__shared__ float smem3[256];
|
__shared__ float smem3[256];
|
||||||
|
#endif
|
||||||
|
|
||||||
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
|
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
|
||||||
|
|
||||||
|
@@ -81,6 +81,8 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
|
|||||||
|
|
||||||
void boxFilter5Gpu(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream);
|
void boxFilter5Gpu(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream);
|
||||||
|
|
||||||
|
void boxFilter5Gpu_CC11(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream);
|
||||||
|
|
||||||
void setGaussianBlurKernel(const float *gKer, int ksizeHalf);
|
void setGaussianBlurKernel(const float *gKer, int ksizeHalf);
|
||||||
|
|
||||||
void gaussianBlurGpu(
|
void gaussianBlurGpu(
|
||||||
@@ -89,6 +91,9 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
|
|||||||
void gaussianBlur5Gpu(
|
void gaussianBlur5Gpu(
|
||||||
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderType, cudaStream_t stream);
|
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderType, cudaStream_t stream);
|
||||||
|
|
||||||
|
void gaussianBlur5Gpu_CC11(
|
||||||
|
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderType, cudaStream_t stream);
|
||||||
|
|
||||||
}}}} // namespace cv { namespace gpu { namespace device { namespace optflow_farneback
|
}}}} // namespace cv { namespace gpu { namespace device { namespace optflow_farneback
|
||||||
|
|
||||||
|
|
||||||
@@ -167,7 +172,10 @@ void cv::gpu::FarnebackOpticalFlow::updateFlow_boxFilter(
|
|||||||
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
|
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
|
||||||
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
|
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
|
||||||
{
|
{
|
||||||
device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
|
if (!isDeviceArch11_)
|
||||||
|
device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
|
||||||
|
else
|
||||||
|
device::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, S(streams[0]));
|
||||||
swap(M, bufM);
|
swap(M, bufM);
|
||||||
|
|
||||||
for (int i = 1; i < 5; ++i)
|
for (int i = 1; i < 5; ++i)
|
||||||
@@ -183,8 +191,12 @@ void cv::gpu::FarnebackOpticalFlow::updateFlow_gaussianBlur(
|
|||||||
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
|
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
|
||||||
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
|
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
|
||||||
{
|
{
|
||||||
device::optflow_farneback::gaussianBlur5Gpu(
|
if (!isDeviceArch11_)
|
||||||
M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
|
device::optflow_farneback::gaussianBlur5Gpu(
|
||||||
|
M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
|
||||||
|
else
|
||||||
|
device::optflow_farneback::gaussianBlur5Gpu_CC11(
|
||||||
|
M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
|
||||||
swap(M, bufM);
|
swap(M, bufM);
|
||||||
|
|
||||||
device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
|
device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
|
||||||
|
@@ -126,18 +126,19 @@ void cv::gpu::PyrLKOpticalFlow::buildImagePyramid(const GpuMat& img0, vector<Gpu
|
|||||||
|
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
void calcPatchSize(cv::Size winSize, int cn, dim3& block, dim3& patch)
|
void calcPatchSize(cv::Size winSize, int cn, dim3& block, dim3& patch, bool isDeviceArch11)
|
||||||
{
|
{
|
||||||
winSize.width *= cn;
|
winSize.width *= cn;
|
||||||
|
|
||||||
if (winSize.width > 32 && winSize.width > 2 * winSize.height)
|
if (winSize.width > 32 && winSize.width > 2 * winSize.height)
|
||||||
{
|
{
|
||||||
block.x = 32;
|
block.x = isDeviceArch11 ? 16 : 32;
|
||||||
block.y = 8;
|
block.y = 8;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
block.x = block.y = 16;
|
block.x = 16;
|
||||||
|
block.y = isDeviceArch11 ? 8 : 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
patch.x = (winSize.width + block.x - 1) / block.x;
|
patch.x = (winSize.width + block.x - 1) / block.x;
|
||||||
@@ -166,7 +167,7 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
|
|||||||
const int cn = prevImg.channels();
|
const int cn = prevImg.channels();
|
||||||
|
|
||||||
dim3 block, patch;
|
dim3 block, patch;
|
||||||
calcPatchSize(winSize, cn, block, patch);
|
calcPatchSize(winSize, cn, block, patch, isDeviceArch11_);
|
||||||
|
|
||||||
CV_Assert(derivLambda >= 0);
|
CV_Assert(derivLambda >= 0);
|
||||||
CV_Assert(maxLevel >= 0 && winSize.width > 2 && winSize.height > 2);
|
CV_Assert(maxLevel >= 0 && winSize.width > 2 && winSize.height > 2);
|
||||||
|
Reference in New Issue
Block a user