merged 2.4 into trunk
This commit is contained in:
@@ -433,6 +433,25 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
|
||||
}
|
||||
|
||||
|
||||
void boxFilter5Gpu_CC11(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
|
||||
{
|
||||
int height = src.rows / 5;
|
||||
int width = src.cols;
|
||||
|
||||
dim3 block(128);
|
||||
dim3 grid(divUp(width, block.x), divUp(height, block.y));
|
||||
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
|
||||
|
||||
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
|
||||
boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
|
||||
|
||||
cudaSafeCall(cudaGetLastError());
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
__constant__ float c_gKer[MAX_KSIZE_HALF + 1];
|
||||
|
||||
template <typename Border>
|
||||
@@ -575,14 +594,14 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
|
||||
}
|
||||
|
||||
|
||||
template <typename Border>
|
||||
template <typename Border, int blockDimX>
|
||||
void gaussianBlur5Caller(
|
||||
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
|
||||
{
|
||||
int height = src.rows / 5;
|
||||
int width = src.cols;
|
||||
|
||||
dim3 block(256);
|
||||
dim3 block(blockDimX);
|
||||
dim3 grid(divUp(width, block.x), divUp(height, block.y));
|
||||
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
|
||||
Border b(height, width);
|
||||
@@ -603,12 +622,26 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
|
||||
|
||||
static const caller_t callers[] =
|
||||
{
|
||||
gaussianBlur5Caller<BrdReflect101<float> >,
|
||||
gaussianBlur5Caller<BrdReplicate<float> >,
|
||||
gaussianBlur5Caller<BrdReflect101<float>,256>,
|
||||
gaussianBlur5Caller<BrdReplicate<float>,256>,
|
||||
};
|
||||
|
||||
callers[borderMode](src, ksizeHalf, dst, stream);
|
||||
}
|
||||
}
|
||||
|
||||
void gaussianBlur5Gpu_CC11(
|
||||
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderMode, cudaStream_t stream)
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2Df, int, DevMem2Df, cudaStream_t);
|
||||
|
||||
static const caller_t callers[] =
|
||||
{
|
||||
gaussianBlur5Caller<BrdReflect101<float>,128>,
|
||||
gaussianBlur5Caller<BrdReplicate<float>,128>,
|
||||
};
|
||||
|
||||
callers[borderMode](src, ksizeHalf, dst, stream);
|
||||
}
|
||||
|
||||
}}}} // namespace cv { namespace gpu { namespace device { namespace optflow_farneback
|
||||
|
||||
|
@@ -181,6 +181,7 @@ namespace cv { namespace gpu { namespace device
|
||||
smem3[tid] = val3;
|
||||
__syncthreads();
|
||||
|
||||
#if __CUDA_ARCH__ > 110
|
||||
if (tid < 128)
|
||||
{
|
||||
smem1[tid] = val1 += smem1[tid + 128];
|
||||
@@ -188,6 +189,7 @@ namespace cv { namespace gpu { namespace device
|
||||
smem3[tid] = val3 += smem3[tid + 128];
|
||||
}
|
||||
__syncthreads();
|
||||
#endif
|
||||
|
||||
if (tid < 64)
|
||||
{
|
||||
@@ -235,12 +237,14 @@ namespace cv { namespace gpu { namespace device
|
||||
smem2[tid] = val2;
|
||||
__syncthreads();
|
||||
|
||||
#if __CUDA_ARCH__ > 110
|
||||
if (tid < 128)
|
||||
{
|
||||
smem1[tid] = val1 += smem1[tid + 128];
|
||||
smem2[tid] = val2 += smem2[tid + 128];
|
||||
}
|
||||
__syncthreads();
|
||||
#endif
|
||||
|
||||
if (tid < 64)
|
||||
{
|
||||
@@ -279,11 +283,13 @@ namespace cv { namespace gpu { namespace device
|
||||
smem1[tid] = val1;
|
||||
__syncthreads();
|
||||
|
||||
#if __CUDA_ARCH__ > 110
|
||||
if (tid < 128)
|
||||
{
|
||||
smem1[tid] = val1 += smem1[tid + 128];
|
||||
}
|
||||
__syncthreads();
|
||||
#endif
|
||||
|
||||
if (tid < 64)
|
||||
{
|
||||
@@ -310,9 +316,15 @@ namespace cv { namespace gpu { namespace device
|
||||
__global__ void lkSparse(const PtrStepb I, const PtrStepb J, const PtrStep<short> dIdx, const PtrStep<short> dIdy,
|
||||
const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
|
||||
{
|
||||
#if __CUDA_ARCH__ <= 110
|
||||
__shared__ float smem1[128];
|
||||
__shared__ float smem2[128];
|
||||
__shared__ float smem3[128];
|
||||
#else
|
||||
__shared__ float smem1[256];
|
||||
__shared__ float smem2[256];
|
||||
__shared__ float smem3[256];
|
||||
#endif
|
||||
|
||||
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
|
||||
|
||||
|
@@ -172,11 +172,11 @@ static void add(float *res, const float *rhs, const int count, cudaStream_t stre
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
__global__ void scaleVector(float *d_res, const float *d_src, float scale, const int len)
|
||||
{
|
||||
const int pos = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (pos >= len) return;
|
||||
|
||||
d_res[pos] = d_src[pos] * scale;
|
||||
const int pos = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (pos >= len) return;
|
||||
|
||||
d_res[pos] = d_src[pos] * scale;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
@@ -191,10 +191,10 @@ __global__ void scaleVector(float *d_res, const float *d_src, float scale, const
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
static void ScaleVector(float *d_res, const float *d_src, float scale, const int len, cudaStream_t stream)
|
||||
{
|
||||
dim3 threads(256);
|
||||
dim3 blocks(iDivUp(len, threads.x));
|
||||
|
||||
scaleVector<<<blocks, threads, 0, stream>>>(d_res, d_src, scale, len);
|
||||
dim3 threads(256);
|
||||
dim3 blocks(iDivUp(len, threads.x));
|
||||
|
||||
scaleVector<<<blocks, threads, 0, stream>>>(d_res, d_src, scale, len);
|
||||
}
|
||||
|
||||
const int SOR_TILE_WIDTH = 32;
|
||||
@@ -1128,14 +1128,14 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
|
||||
|
||||
ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
|
||||
ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );
|
||||
|
||||
ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
|
||||
|
||||
ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
|
||||
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
|
||||
|
||||
ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
|
||||
ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );
|
||||
|
||||
ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
|
||||
|
||||
ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
|
||||
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
|
||||
|
||||
cv::gpu::device::swap<FloatVector*>(ptrU, ptrUNew);
|
||||
|
@@ -2508,7 +2508,7 @@ __global__ void resizeBicubic(NcvSize32u srcSize,
|
||||
wsum += wx;
|
||||
}
|
||||
}
|
||||
dst[(ix + dstROI.x)+ (iy + dstROI.y) * dstStep] = sum / wsum;
|
||||
dst[(ix + dstROI.x)+ (iy + dstROI.y) * dstStep] = (!wsum)? 0 : sum / wsum;
|
||||
}
|
||||
|
||||
|
||||
|
@@ -81,6 +81,8 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
|
||||
|
||||
void boxFilter5Gpu(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream);
|
||||
|
||||
void boxFilter5Gpu_CC11(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream);
|
||||
|
||||
void setGaussianBlurKernel(const float *gKer, int ksizeHalf);
|
||||
|
||||
void gaussianBlurGpu(
|
||||
@@ -89,6 +91,9 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
|
||||
void gaussianBlur5Gpu(
|
||||
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderType, cudaStream_t stream);
|
||||
|
||||
void gaussianBlur5Gpu_CC11(
|
||||
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderType, cudaStream_t stream);
|
||||
|
||||
}}}} // namespace cv { namespace gpu { namespace device { namespace optflow_farneback
|
||||
|
||||
|
||||
@@ -167,7 +172,10 @@ void cv::gpu::FarnebackOpticalFlow::updateFlow_boxFilter(
|
||||
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
|
||||
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
|
||||
{
|
||||
device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
|
||||
if (!isDeviceArch11_)
|
||||
device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
|
||||
else
|
||||
device::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, S(streams[0]));
|
||||
swap(M, bufM);
|
||||
|
||||
for (int i = 1; i < 5; ++i)
|
||||
@@ -183,8 +191,12 @@ void cv::gpu::FarnebackOpticalFlow::updateFlow_gaussianBlur(
|
||||
const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
|
||||
GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
|
||||
{
|
||||
device::optflow_farneback::gaussianBlur5Gpu(
|
||||
M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
|
||||
if (!isDeviceArch11_)
|
||||
device::optflow_farneback::gaussianBlur5Gpu(
|
||||
M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
|
||||
else
|
||||
device::optflow_farneback::gaussianBlur5Gpu_CC11(
|
||||
M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
|
||||
swap(M, bufM);
|
||||
|
||||
device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
|
||||
|
@@ -622,6 +622,9 @@ void cv::gpu::ORB_GPU::computeDescriptors(GpuMat& descriptors)
|
||||
if (keyPointsCount_[level] == 0)
|
||||
continue;
|
||||
|
||||
if (keyPointsCount_[level] == 0)
|
||||
continue;
|
||||
|
||||
GpuMat descRange = descriptors.rowRange(offset, offset + keyPointsCount_[level]);
|
||||
|
||||
if (blurForDescriptor)
|
||||
|
Reference in New Issue
Block a user