merged 2.4 into trunk

This commit is contained in:
Vadim Pisarevsky
2012-04-30 14:33:52 +00:00
parent 3f1c6d7357
commit d5a0088bbe
194 changed files with 10158 additions and 8225 deletions

View File

@@ -433,6 +433,25 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
}
void boxFilter5Gpu_CC11(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
dim3 block(128);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
__constant__ float c_gKer[MAX_KSIZE_HALF + 1];
template <typename Border>
@@ -575,14 +594,14 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
}
template <typename Border>
template <typename Border, int blockDimX>
void gaussianBlur5Caller(
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
dim3 block(256);
dim3 block(blockDimX);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
Border b(height, width);
@@ -603,12 +622,26 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
static const caller_t callers[] =
{
gaussianBlur5Caller<BrdReflect101<float> >,
gaussianBlur5Caller<BrdReplicate<float> >,
gaussianBlur5Caller<BrdReflect101<float>,256>,
gaussianBlur5Caller<BrdReplicate<float>,256>,
};
callers[borderMode](src, ksizeHalf, dst, stream);
}
}
void gaussianBlur5Gpu_CC11(
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2Df, int, DevMem2Df, cudaStream_t);
static const caller_t callers[] =
{
gaussianBlur5Caller<BrdReflect101<float>,128>,
gaussianBlur5Caller<BrdReplicate<float>,128>,
};
callers[borderMode](src, ksizeHalf, dst, stream);
}
}}}} // namespace cv { namespace gpu { namespace device { namespace optflow_farneback

View File

@@ -181,6 +181,7 @@ namespace cv { namespace gpu { namespace device
smem3[tid] = val3;
__syncthreads();
#if __CUDA_ARCH__ > 110
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
@@ -188,6 +189,7 @@ namespace cv { namespace gpu { namespace device
smem3[tid] = val3 += smem3[tid + 128];
}
__syncthreads();
#endif
if (tid < 64)
{
@@ -235,12 +237,14 @@ namespace cv { namespace gpu { namespace device
smem2[tid] = val2;
__syncthreads();
#if __CUDA_ARCH__ > 110
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
}
__syncthreads();
#endif
if (tid < 64)
{
@@ -279,11 +283,13 @@ namespace cv { namespace gpu { namespace device
smem1[tid] = val1;
__syncthreads();
#if __CUDA_ARCH__ > 110
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
}
__syncthreads();
#endif
if (tid < 64)
{
@@ -310,9 +316,15 @@ namespace cv { namespace gpu { namespace device
__global__ void lkSparse(const PtrStepb I, const PtrStepb J, const PtrStep<short> dIdx, const PtrStep<short> dIdy,
const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{
#if __CUDA_ARCH__ <= 110
__shared__ float smem1[128];
__shared__ float smem2[128];
__shared__ float smem3[128];
#else
__shared__ float smem1[256];
__shared__ float smem2[256];
__shared__ float smem3[256];
#endif
const int tid = threadIdx.y * blockDim.x + threadIdx.x;