renamed gpuoptflow -> cudaoptflow

This commit is contained in:
Vladislav Vinogradov
2013-07-23 17:04:38 +04:00
parent a0ae602bb7
commit 5660d6a680
47 changed files with 45 additions and 45 deletions

View File

@@ -0,0 +1,169 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace optflowbm
{
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_prev(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_curr(false, cudaFilterModePoint, cudaAddressModeClamp);
__device__ int cmpBlocks(int X1, int Y1, int X2, int Y2, int2 blockSize)
{
int s = 0;
for (int y = 0; y < blockSize.y; ++y)
{
for (int x = 0; x < blockSize.x; ++x)
s += ::abs(tex2D(tex_prev, X1 + x, Y1 + y) - tex2D(tex_curr, X2 + x, Y2 + y));
}
return s;
}
__global__ void calcOptFlowBM(PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious,
const int maxX, const int maxY, const int acceptLevel, const int escapeLevel,
const short2* ss, const int ssCount)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i >= velx.rows || j >= velx.cols)
return;
const int X1 = j * shiftSize.x;
const int Y1 = i * shiftSize.y;
const int offX = usePrevious ? __float2int_rn(velx(i, j)) : 0;
const int offY = usePrevious ? __float2int_rn(vely(i, j)) : 0;
int X2 = X1 + offX;
int Y2 = Y1 + offY;
int dist = numeric_limits<int>::max();
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
dist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
int countMin = 1;
int sumx = offX;
int sumy = offY;
if (dist > acceptLevel)
{
// do brute-force search
for (int k = 0; k < ssCount; ++k)
{
const short2 ssVal = ss[k];
const int dx = offX + ssVal.x;
const int dy = offY + ssVal.y;
X2 = X1 + dx;
Y2 = Y1 + dy;
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
{
const int tmpDist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
if (tmpDist < acceptLevel)
{
sumx = dx;
sumy = dy;
countMin = 1;
break;
}
if (tmpDist < dist)
{
dist = tmpDist;
sumx = dx;
sumy = dy;
countMin = 1;
}
else if (tmpDist == dist)
{
sumx += dx;
sumy += dy;
countMin++;
}
}
}
if (dist > escapeLevel)
{
sumx = offX;
sumy = offY;
countMin = 1;
}
}
velx(i, j) = static_cast<float>(sumx) / countMin;
vely(i, j) = static_cast<float>(sumy) / countMin;
}
void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream)
{
bindTexture(&tex_prev, prev);
bindTexture(&tex_curr, curr);
const dim3 block(32, 8);
const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y));
calcOptFlowBM<<<grid, block, 0, stream>>>(velx, vely, blockSize, shiftSize, usePrevious,
maxX, maxY, acceptLevel, escapeLevel, ss, ssCount);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#endif // !defined CUDA_DISABLER

View File

@@ -0,0 +1,295 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace optflowbm_fast
{
enum
{
CTA_SIZE = 128,
TILE_COLS = 128,
TILE_ROWS = 32,
STRIDE = CTA_SIZE
};
template <typename T> __device__ __forceinline__ int calcDist(T a, T b)
{
return ::abs(a - b);
}
template <class T> struct FastOptFlowBM
{
int search_radius;
int block_radius;
int search_window;
int block_window;
PtrStepSz<T> I0;
PtrStep<T> I1;
mutable PtrStepi buffer;
FastOptFlowBM(int search_window_, int block_window_,
PtrStepSz<T> I0_, PtrStepSz<T> I1_,
PtrStepi buffer_) :
search_radius(search_window_ / 2), block_radius(block_window_ / 2),
search_window(search_window_), block_window(block_window_),
I0(I0_), I1(I1_),
buffer(buffer_)
{
}
__device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
dist_sums[index] = 0;
for (int tx = 0; tx < block_window; ++tx)
col_sums(tx, index) = 0;
int y = index / search_window;
int x = index - y * search_window;
int ay = i;
int ax = j;
int by = i + y - search_radius;
int bx = j + x - search_radius;
for (int tx = -block_radius; tx <= block_radius; ++tx)
{
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
{
int dist = calcDist(I0(ay + ty, ax + tx), I1(by + ty, bx + tx));
dist_sums[index] += dist;
col_sum += dist;
}
col_sums(tx + block_radius, index) = col_sum;
}
up_col_sums(j, index) = col_sums(block_window - 1, index);
}
}
__device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
int ay = i;
int ax = j + block_radius;
int by = i + y - search_radius;
int bx = j + x - search_radius + block_radius;
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
col_sum += calcDist(I0(ay + ty, ax), I1(by + ty, bx));
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
__device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
int ay = i;
int ax = j + block_radius;
T a_up = I0(ay - block_radius - 1, ax);
T a_down = I0(ay + block_radius, ax);
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
int by = i + y - search_radius;
int bx = j + x - search_radius + block_radius;
T b_up = I1(by - block_radius - 1, bx);
T b_down = I1(by + block_radius, bx);
int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up);
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
__device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, float& velx, float& vely) const
{
int bestDist = numeric_limits<int>::max();
int bestInd = -1;
for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int curDist = dist_sums[index];
if (curDist < bestDist)
{
bestDist = curDist;
bestInd = index;
}
}
__shared__ int cta_dist_buffer[CTA_SIZE];
__shared__ int cta_ind_buffer[CTA_SIZE];
reduceKeyVal<CTA_SIZE>(cta_dist_buffer, bestDist, cta_ind_buffer, bestInd, threadIdx.x, less<int>());
if (threadIdx.x == 0)
{
int y = bestInd / search_window;
int x = bestInd - y * search_window;
velx = x - search_radius;
vely = y - search_radius;
}
}
__device__ __forceinline__ void operator()(PtrStepf velx, PtrStepf vely) const
{
int tbx = blockIdx.x * TILE_COLS;
int tby = blockIdx.y * TILE_ROWS;
int tex = ::min(tbx + TILE_COLS, I0.cols);
int tey = ::min(tby + TILE_ROWS, I0.rows);
PtrStepi col_sums;
col_sums.data = buffer.ptr(I0.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window;
col_sums.step = buffer.step;
PtrStepi up_col_sums;
up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window;
up_col_sums.step = buffer.step;
extern __shared__ int dist_sums[]; //search_window * search_window
int first = 0;
for (int i = tby; i < tey; ++i)
{
for (int j = tbx; j < tex; ++j)
{
__syncthreads();
if (j == tbx)
{
initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums);
first = 0;
}
else
{
if (i == tby)
shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums);
else
shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums);
first = (first + 1) % block_window;
}
__syncthreads();
convolve_window(i, j, dist_sums, velx(i, j), vely(i, j));
}
}
}
};
template<typename T> __global__ void optflowbm_fast_kernel(const FastOptFlowBM<T> fbm, PtrStepf velx, PtrStepf vely)
{
fbm(velx, vely);
}
void get_buffer_size(int src_cols, int src_rows, int search_window, int block_window, int& buffer_cols, int& buffer_rows)
{
dim3 grid(divUp(src_cols, TILE_COLS), divUp(src_rows, TILE_ROWS));
buffer_cols = search_window * search_window * grid.y;
buffer_rows = src_cols + block_window * grid.x;
}
template <typename T>
void calc(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream)
{
FastOptFlowBM<T> fbm(search_window, block_window, I0, I1, buffer);
dim3 block(CTA_SIZE, 1);
dim3 grid(divUp(I0.cols, TILE_COLS), divUp(I0.rows, TILE_ROWS));
size_t smem = search_window * search_window * sizeof(int);
optflowbm_fast_kernel<<<grid, block, smem, stream>>>(fbm, velx, vely);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void calc<uchar>(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream);
}
#endif // !defined CUDA_DISABLER

View File

@@ -0,0 +1,656 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#define tx threadIdx.x
#define ty threadIdx.y
#define bx blockIdx.x
#define by blockIdx.y
#define bdx blockDim.x
#define bdy blockDim.y
#define BORDER_SIZE 5
#define MAX_KSIZE_HALF 100
namespace cv { namespace cuda { namespace device { namespace optflow_farneback
{
__constant__ float c_g[8];
__constant__ float c_xg[8];
__constant__ float c_xxg[8];
__constant__ float c_ig11, c_ig03, c_ig33, c_ig55;
template <int polyN>
__global__ void polynomialExpansion(
const int height, const int width, const PtrStepf src, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * (bdx - 2*polyN) + tx - polyN;
if (y < height)
{
extern __shared__ float smem[];
volatile float *row = smem + tx;
int xWarped = ::min(::max(x, 0), width - 1);
row[0] = src(y, xWarped) * c_g[0];
row[bdx] = 0.f;
row[2*bdx] = 0.f;
for (int k = 1; k <= polyN; ++k)
{
float t0 = src(::max(y - k, 0), xWarped);
float t1 = src(::min(y + k, height - 1), xWarped);
row[0] += c_g[k] * (t0 + t1);
row[bdx] += c_xg[k] * (t1 - t0);
row[2*bdx] += c_xxg[k] * (t0 + t1);
}
__syncthreads();
if (tx >= polyN && tx + polyN < bdx && x < width)
{
float b1 = c_g[0] * row[0];
float b3 = c_g[0] * row[bdx];
float b5 = c_g[0] * row[2*bdx];
float b2 = 0, b4 = 0, b6 = 0;
for (int k = 1; k <= polyN; ++k)
{
b1 += (row[k] + row[-k]) * c_g[k];
b4 += (row[k] + row[-k]) * c_xxg[k];
b2 += (row[k] - row[-k]) * c_xg[k];
b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];
b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];
b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];
}
dst(y, xWarped) = b3*c_ig11;
dst(height + y, xWarped) = b2*c_ig11;
dst(2*height + y, xWarped) = b1*c_ig03 + b5*c_ig33;
dst(3*height + y, xWarped) = b1*c_ig03 + b4*c_ig33;
dst(4*height + y, xWarped) = b6*c_ig55;
}
}
}
void setPolynomialExpansionConsts(
int polyN, const float *g, const float *xg, const float *xxg,
float ig11, float ig03, float ig33, float ig55)
{
cudaSafeCall(cudaMemcpyToSymbol(c_g, g, (polyN + 1) * sizeof(*g)));
cudaSafeCall(cudaMemcpyToSymbol(c_xg, xg, (polyN + 1) * sizeof(*xg)));
cudaSafeCall(cudaMemcpyToSymbol(c_xxg, xxg, (polyN + 1) * sizeof(*xxg)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig11, &ig11, sizeof(ig11)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig03, &ig03, sizeof(ig03)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig33, &ig33, sizeof(ig33)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig55, &ig55, sizeof(ig55)));
}
void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream)
{
dim3 block(256);
dim3 grid(divUp(src.cols, block.x - 2*polyN), src.rows);
int smem = 3 * block.x * sizeof(float);
if (polyN == 5)
polynomialExpansion<5><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
else if (polyN == 7)
polynomialExpansion<7><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
__constant__ float c_border[BORDER_SIZE + 1];
__global__ void updateMatrices(
const int height, const int width, const PtrStepf flowx, const PtrStepf flowy,
const PtrStepf R0, const PtrStepf R1, PtrStepf M)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
if (y < height && x < width)
{
float dx = flowx(y, x);
float dy = flowy(y, x);
float fx = x + dx;
float fy = y + dy;
int x1 = floorf(fx);
int y1 = floorf(fy);
fx -= x1; fy -= y1;
float r2, r3, r4, r5, r6;
if (x1 >= 0 && y1 >= 0 && x1 < width - 1 && y1 < height - 1)
{
float a00 = (1.f - fx) * (1.f - fy);
float a01 = fx * (1.f - fy);
float a10 = (1.f - fx) * fy;
float a11 = fx * fy;
r2 = a00 * R1(y1, x1) +
a01 * R1(y1, x1 + 1) +
a10 * R1(y1 + 1, x1) +
a11 * R1(y1 + 1, x1 + 1);
r3 = a00 * R1(height + y1, x1) +
a01 * R1(height + y1, x1 + 1) +
a10 * R1(height + y1 + 1, x1) +
a11 * R1(height + y1 + 1, x1 + 1);
r4 = a00 * R1(2*height + y1, x1) +
a01 * R1(2*height + y1, x1 + 1) +
a10 * R1(2*height + y1 + 1, x1) +
a11 * R1(2*height + y1 + 1, x1 + 1);
r5 = a00 * R1(3*height + y1, x1) +
a01 * R1(3*height + y1, x1 + 1) +
a10 * R1(3*height + y1 + 1, x1) +
a11 * R1(3*height + y1 + 1, x1 + 1);
r6 = a00 * R1(4*height + y1, x1) +
a01 * R1(4*height + y1, x1 + 1) +
a10 * R1(4*height + y1 + 1, x1) +
a11 * R1(4*height + y1 + 1, x1 + 1);
r4 = (R0(2*height + y, x) + r4) * 0.5f;
r5 = (R0(3*height + y, x) + r5) * 0.5f;
r6 = (R0(4*height + y, x) + r6) * 0.25f;
}
else
{
r2 = r3 = 0.f;
r4 = R0(2*height + y, x);
r5 = R0(3*height + y, x);
r6 = R0(4*height + y, x) * 0.5f;
}
r2 = (R0(y, x) - r2) * 0.5f;
r3 = (R0(height + y, x) - r3) * 0.5f;
r2 += r4*dy + r6*dx;
r3 += r6*dy + r5*dx;
float scale =
c_border[::min(x, BORDER_SIZE)] *
c_border[::min(y, BORDER_SIZE)] *
c_border[::min(width - x - 1, BORDER_SIZE)] *
c_border[::min(height - y - 1, BORDER_SIZE)];
r2 *= scale; r3 *= scale; r4 *= scale;
r5 *= scale; r6 *= scale;
M(y, x) = r4*r4 + r6*r6;
M(height + y, x) = (r4 + r5)*r6;
M(2*height + y, x) = r5*r5 + r6*r6;
M(3*height + y, x) = r4*r2 + r6*r3;
M(4*height + y, x) = r6*r2 + r5*r3;
}
}
void setUpdateMatricesConsts()
{
static const float border[BORDER_SIZE + 1] = {0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f};
cudaSafeCall(cudaMemcpyToSymbol(c_border, border, (BORDER_SIZE + 1) * sizeof(*border)));
}
void updateMatricesGpu(
const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
PtrStepSzf M, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
updateMatrices<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, flowx, flowy, R0, R1, M);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
__global__ void updateFlow(
const int height, const int width, const PtrStepf M, PtrStepf flowx, PtrStepf flowy)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
if (y < height && x < width)
{
float g11 = M(y, x);
float g12 = M(height + y, x);
float g22 = M(2*height + y, x);
float h1 = M(3*height + y, x);
float h2 = M(4*height + y, x);
float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);
flowx(y, x) = (g11*h2 - g12*h1) * detInv;
flowy(y, x) = (g22*h1 - g12*h2) * detInv;
}
}
void updateFlowGpu(const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
updateFlow<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, M, flowx, flowy);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
/*__global__ void boxFilter(
const int height, const int width, const PtrStepf src,
const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = ::min(::max(xExt, 0), width - 1);
row[i] = src(y, xExt);
for (int j = 1; j <= ksizeHalf; ++j)
row[i] += src(::max(y - j, 0), xExt) + src(::min(y + j, height - 1), xExt);
}
if (x < width)
{
__syncthreads();
// Horizontal passs
row += tx + ksizeHalf;
float res = row[0];
for (int i = 1; i <= ksizeHalf; ++i)
res += row[-i] + row[i];
dst(y, x) = res * boxAreaInv;
}
}
}
void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
dim3 block(256);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
boxFilter<<<grid, block, smem, stream>>>(src.rows, src.cols, src, ksizeHalf, boxAreaInv, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}*/
__global__ void boxFilter5(
const int height, const int width, const PtrStepf src,
const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
const int smw = bdx + 2*ksizeHalf; // shared memory "width"
volatile float *row = smem + 5 * ty * smw;
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = ::min(::max(xExt, 0), width - 1);
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] = src(k*height + y, xExt);
for (int j = 1; j <= ksizeHalf; ++j)
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] +=
src(k*height + ::max(y - j, 0), xExt) +
src(k*height + ::min(y + j, height - 1), xExt);
}
if (x < width)
{
__syncthreads();
// Horizontal passs
row += tx + ksizeHalf;
float res[5];
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] = row[k*smw];
for (int i = 1; i <= ksizeHalf; ++i)
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] += row[k*smw - i] + row[k*smw + i];
#pragma unroll
for (int k = 0; k < 5; ++k)
dst(k*height + y, x) = res[k] * boxAreaInv;
}
}
}
void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
dim3 block(256);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
dim3 block(128);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
__constant__ float c_gKer[MAX_KSIZE_HALF + 1];
template <typename Border>
__global__ void gaussianBlur(
const int height, const int width, const PtrStepf src, const int ksizeHalf,
const Border b, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = b.idx_col(xExt);
row[i] = src(y, xExt) * c_gKer[0];
for (int j = 1; j <= ksizeHalf; ++j)
row[i] +=
(src(b.idx_row_low(y - j), xExt) +
src(b.idx_row_high(y + j), xExt)) * c_gKer[j];
}
if (x < width)
{
__syncthreads();
// Horizontal pass
row += tx + ksizeHalf;
float res = row[0] * c_gKer[0];
for (int i = 1; i <= ksizeHalf; ++i)
res += (row[-i] + row[i]) * c_gKer[i];
dst(y, x) = res;
}
}
}
void setGaussianBlurKernel(const float *gKer, int ksizeHalf)
{
cudaSafeCall(cudaMemcpyToSymbol(c_gKer, gKer, (ksizeHalf + 1) * sizeof(*gKer)));
}
template <typename Border>
void gaussianBlurCaller(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows;
int width = src.cols;
dim3 block(256);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
Border b(height, width);
gaussianBlur<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void gaussianBlurGpu(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
static const caller_t callers[] =
{
0 /*gaussianBlurCaller<BrdConstant<float> >*/,
gaussianBlurCaller<BrdReplicate<float> >,
0 /*gaussianBlurCaller<BrdReflect<float> >*/,
0 /*gaussianBlurCaller<BrdWrap<float> >*/,
gaussianBlurCaller<BrdReflect101<float> >
};
callers[borderMode](src, ksizeHalf, dst, stream);
}
template <typename Border>
__global__ void gaussianBlur5(
const int height, const int width, const PtrStepf src, const int ksizeHalf,
const Border b, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
const int smw = bdx + 2*ksizeHalf; // shared memory "width"
volatile float *row = smem + 5 * ty * smw;
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = b.idx_col(xExt);
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] = src(k*height + y, xExt) * c_gKer[0];
for (int j = 1; j <= ksizeHalf; ++j)
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] +=
(src(k*height + b.idx_row_low(y - j), xExt) +
src(k*height + b.idx_row_high(y + j), xExt)) * c_gKer[j];
}
if (x < width)
{
__syncthreads();
// Horizontal pass
row += tx + ksizeHalf;
float res[5];
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] = row[k*smw] * c_gKer[0];
for (int i = 1; i <= ksizeHalf; ++i)
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];
#pragma unroll
for (int k = 0; k < 5; ++k)
dst(k*height + y, x) = res[k];
}
}
}
template <typename Border, int blockDimX>
void gaussianBlur5Caller(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
dim3 block(blockDimX);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
Border b(height, width);
gaussianBlur5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void gaussianBlur5Gpu(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
static const caller_t callers[] =
{
0 /*gaussianBlur5Caller<BrdConstant<float>,256>*/,
gaussianBlur5Caller<BrdReplicate<float>,256>,
0 /*gaussianBlur5Caller<BrdReflect<float>,256>*/,
0 /*gaussianBlur5Caller<BrdWrap<float>,256>*/,
gaussianBlur5Caller<BrdReflect101<float>,256>
};
callers[borderMode](src, ksizeHalf, dst, stream);
}
void gaussianBlur5Gpu_CC11(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
static const caller_t callers[] =
{
0 /*gaussianBlur5Caller<BrdConstant<float>,128>*/,
gaussianBlur5Caller<BrdReplicate<float>,128>,
0 /*gaussianBlur5Caller<BrdReflect<float>,128>*/,
0 /*gaussianBlur5Caller<BrdWrap<float>,128>*/,
gaussianBlur5Caller<BrdReflect101<float>,128>
};
callers[borderMode](src, ksizeHalf, dst, stream);
}
}}}} // namespace cv { namespace cuda { namespace cudev { namespace optflow_farneback
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,220 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace cuda { namespace device
{
namespace optical_flow
{
#define NEEDLE_MAP_SCALE 16
#define NUM_VERTS_PER_ARROW 6
__global__ void NeedleMapAverageKernel(const PtrStepSzf u, const PtrStepf v, PtrStepf u_avg, PtrStepf v_avg)
{
__shared__ float smem[2 * NEEDLE_MAP_SCALE];
volatile float* u_col_sum = smem;
volatile float* v_col_sum = u_col_sum + NEEDLE_MAP_SCALE;
const int x = blockIdx.x * NEEDLE_MAP_SCALE + threadIdx.x;
const int y = blockIdx.y * NEEDLE_MAP_SCALE;
u_col_sum[threadIdx.x] = 0;
v_col_sum[threadIdx.x] = 0;
#pragma unroll
for(int i = 0; i < NEEDLE_MAP_SCALE; ++i)
{
u_col_sum[threadIdx.x] += u(::min(y + i, u.rows - 1), x);
v_col_sum[threadIdx.x] += v(::min(y + i, u.rows - 1), x);
}
if (threadIdx.x < 8)
{
// now add the column sums
const uint X = threadIdx.x;
if (X | 0xfe == 0xfe) // bit 0 is 0
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 1];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 1];
}
if (X | 0xfe == 0xfc) // bits 0 & 1 == 0
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 2];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 2];
}
if (X | 0xf8 == 0xf8)
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 4];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 4];
}
if (X == 0)
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 8];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 8];
}
}
if (threadIdx.x == 0)
{
const float coeff = 1.0f / (NEEDLE_MAP_SCALE * NEEDLE_MAP_SCALE);
u_col_sum[0] *= coeff;
v_col_sum[0] *= coeff;
u_avg(blockIdx.y, blockIdx.x) = u_col_sum[0];
v_avg(blockIdx.y, blockIdx.x) = v_col_sum[0];
}
}
void NeedleMapAverage_gpu(PtrStepSzf u, PtrStepSzf v, PtrStepSzf u_avg, PtrStepSzf v_avg)
{
const dim3 block(NEEDLE_MAP_SCALE);
const dim3 grid(u_avg.cols, u_avg.rows);
NeedleMapAverageKernel<<<grid, block>>>(u, v, u_avg, v_avg);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void NeedleMapVertexKernel(const PtrStepSzf u_avg, const PtrStepf v_avg, float* vertex_data, float* color_data, float max_flow, float xscale, float yscale)
{
// test - just draw a triangle at each pixel
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const float arrow_x = x * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
const float arrow_y = y * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
float3 v[NUM_VERTS_PER_ARROW];
if (x < u_avg.cols && y < u_avg.rows)
{
const float u_avg_val = u_avg(y, x);
const float v_avg_val = v_avg(y, x);
const float theta = ::atan2f(v_avg_val, u_avg_val);
float r = ::sqrtf(v_avg_val * v_avg_val + u_avg_val * u_avg_val);
r = fmin(14.0f * (r / max_flow), 14.0f);
v[0].z = 1.0f;
v[1].z = 0.7f;
v[2].z = 0.7f;
v[3].z = 0.7f;
v[4].z = 0.7f;
v[5].z = 1.0f;
v[0].x = arrow_x;
v[0].y = arrow_y;
v[5].x = arrow_x;
v[5].y = arrow_y;
v[2].x = arrow_x + r * ::cosf(theta);
v[2].y = arrow_y + r * ::sinf(theta);
v[3].x = v[2].x;
v[3].y = v[2].y;
r = ::fmin(r, 2.5f);
v[1].x = arrow_x + r * ::cosf(theta - CV_PI_F / 2.0f);
v[1].y = arrow_y + r * ::sinf(theta - CV_PI_F / 2.0f);
v[4].x = arrow_x + r * ::cosf(theta + CV_PI_F / 2.0f);
v[4].y = arrow_y + r * ::sinf(theta + CV_PI_F / 2.0f);
int indx = (y * u_avg.cols + x) * NUM_VERTS_PER_ARROW * 3;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[0].x * xscale;
vertex_data[indx++] = v[0].y * yscale;
vertex_data[indx++] = v[0].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[1].x * xscale;
vertex_data[indx++] = v[1].y * yscale;
vertex_data[indx++] = v[1].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[2].x * xscale;
vertex_data[indx++] = v[2].y * yscale;
vertex_data[indx++] = v[2].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[3].x * xscale;
vertex_data[indx++] = v[3].y * yscale;
vertex_data[indx++] = v[3].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[4].x * xscale;
vertex_data[indx++] = v[4].y * yscale;
vertex_data[indx++] = v[4].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[5].x * xscale;
vertex_data[indx++] = v[5].y * yscale;
vertex_data[indx++] = v[5].z;
}
}
void CreateOpticalFlowNeedleMap_gpu(PtrStepSzf u_avg, PtrStepSzf v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale)
{
const dim3 block(16);
const dim3 grid(divUp(u_avg.cols, block.x), divUp(u_avg.rows, block.y));
NeedleMapVertexKernel<<<grid, block>>>(u_avg, v_avg, vertex_buffer, color_data, max_flow, xscale, yscale);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,560 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/reduce.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace pyrlk
{
__constant__ int c_winSize_x;
__constant__ int c_winSize_y;
__constant__ int c_halfWin_x;
__constant__ int c_halfWin_y;
__constant__ int c_iters;
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
template <int cn> struct Tex_I;
template <> struct Tex_I<1>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_If, x, y);
}
};
template <> struct Tex_I<4>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_If4, x, y);
}
};
template <int cn> struct Tex_J;
template <> struct Tex_J<1>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_Jf, x, y);
}
};
template <> struct Tex_J<4>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_Jf4, x, y);
}
};
__device__ __forceinline__ void accum(float& dst, float val)
{
dst += val;
}
__device__ __forceinline__ void accum(float& dst, const float4& val)
{
dst += val.x + val.y + val.z;
}
__device__ __forceinline__ float abs_(float a)
{
return ::fabsf(a);
}
__device__ __forceinline__ float4 abs_(const float4& a)
{
return abs(a);
}
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
__global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{
#if __CUDA_ARCH__ <= 110
const int BLOCK_SIZE = 128;
#else
const int BLOCK_SIZE = 256;
#endif
__shared__ float smem1[BLOCK_SIZE];
__shared__ float smem2[BLOCK_SIZE];
__shared__ float smem3[BLOCK_SIZE];
const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
float2 prevPt = prevPts[blockIdx.x];
prevPt.x *= (1.0f / (1 << level));
prevPt.y *= (1.0f / (1 << level));
if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
prevPt.x -= c_halfWin_x;
prevPt.y -= c_halfWin_y;
// extract the patch from the first image, compute covariation matrix of derivatives
float A11 = 0;
float A12 = 0;
float A22 = 0;
typedef typename TypeVec<float, cn>::vec_type work_type;
work_type I_patch [PATCH_Y][PATCH_X];
work_type dIdx_patch[PATCH_Y][PATCH_X];
work_type dIdy_patch[PATCH_Y][PATCH_X];
for (int yBase = threadIdx.y, i = 0; yBase < c_winSize_y; yBase += blockDim.y, ++i)
{
for (int xBase = threadIdx.x, j = 0; xBase < c_winSize_x; xBase += blockDim.x, ++j)
{
float x = prevPt.x + xBase + 0.5f;
float y = prevPt.y + yBase + 0.5f;
I_patch[i][j] = Tex_I<cn>::read(x, y);
// Sharr Deriv
work_type dIdx = 3.0f * Tex_I<cn>::read(x+1, y-1) + 10.0f * Tex_I<cn>::read(x+1, y) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
(3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x-1, y) + 3.0f * Tex_I<cn>::read(x-1, y+1));
work_type dIdy = 3.0f * Tex_I<cn>::read(x-1, y+1) + 10.0f * Tex_I<cn>::read(x, y+1) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
(3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x, y-1) + 3.0f * Tex_I<cn>::read(x+1, y-1));
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
accum(A11, dIdx * dIdx);
accum(A12, dIdx * dIdy);
accum(A22, dIdy * dIdy);
}
}
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
smem1[0] = A11;
smem2[0] = A12;
smem3[0] = A22;
}
#endif
__syncthreads();
A11 = smem1[0];
A12 = smem2[0];
A22 = smem3[0];
float D = A11 * A22 - A12 * A12;
if (D < numeric_limits<float>::epsilon())
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
D = 1.f / D;
A11 *= D;
A12 *= D;
A22 *= D;
float2 nextPt = nextPts[blockIdx.x];
nextPt.x *= 2.f;
nextPt.y *= 2.f;
nextPt.x -= c_halfWin_x;
nextPt.y -= c_halfWin_y;
for (int k = 0; k < c_iters; ++k)
{
if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
float b1 = 0;
float b2 = 0;
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type diff = (J_val - I_val) * 32.0f;
accum(b1, diff * dIdx_patch[i][j]);
accum(b2, diff * dIdy_patch[i][j]);
}
}
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus<float>(), plus<float>()));
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
smem1[0] = b1;
smem2[0] = b2;
}
#endif
__syncthreads();
b1 = smem1[0];
b2 = smem2[0];
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
break;
}
float errval = 0;
if (calcErr)
{
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type diff = J_val - I_val;
accum(errval, abs_(diff));
}
}
reduce<BLOCK_SIZE>(smem1, errval, tid, plus<float>());
}
if (tid == 0)
{
nextPt.x += c_halfWin_x;
nextPt.y += c_halfWin_y;
nextPts[blockIdx.x] = nextPt;
if (calcErr)
err[blockIdx.x] = static_cast<float>(errval) / (cn * c_winSize_x * c_winSize_y);
}
}
template <int cn, int PATCH_X, int PATCH_Y>
void sparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
if (level == 0 && err)
sparseKernel<cn, PATCH_X, PATCH_Y, true><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
else
sparseKernel<cn, PATCH_X, PATCH_Y, false><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <bool calcErr>
__global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
{
extern __shared__ int smem[];
const int patchWidth = blockDim.x + 2 * c_halfWin_x;
const int patchHeight = blockDim.y + 2 * c_halfWin_y;
int* I_patch = smem;
int* dIdx_patch = I_patch + patchWidth * patchHeight;
int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;
const int xBase = blockIdx.x * blockDim.x;
const int yBase = blockIdx.y * blockDim.y;
for (int i = threadIdx.y; i < patchHeight; i += blockDim.y)
{
for (int j = threadIdx.x; j < patchWidth; j += blockDim.x)
{
float x = xBase - c_halfWin_x + j + 0.5f;
float y = yBase - c_halfWin_y + i + 0.5f;
I_patch[i * patchWidth + j] = tex2D(tex_Ib, x, y);
// Sharr Deriv
dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x+1, y-1) + 10 * tex2D(tex_Ib, x+1, y) + 3 * tex2D(tex_Ib, x+1, y+1) -
(3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x-1, y) + 3 * tex2D(tex_Ib, x-1, y+1));
dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x-1, y+1) + 10 * tex2D(tex_Ib, x, y+1) + 3 * tex2D(tex_Ib, x+1, y+1) -
(3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x, y-1) + 3 * tex2D(tex_Ib, x+1, y-1));
}
}
__syncthreads();
const int x = xBase + threadIdx.x;
const int y = yBase + threadIdx.y;
if (x >= cols || y >= rows)
return;
int A11i = 0;
int A12i = 0;
int A22i = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
for (int j = 0; j < c_winSize_x; ++j)
{
int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
A11i += dIdx * dIdx;
A12i += dIdx * dIdy;
A22i += dIdy * dIdy;
}
}
float A11 = A11i;
float A12 = A12i;
float A22 = A22i;
float D = A11 * A22 - A12 * A12;
if (D < numeric_limits<float>::epsilon())
{
if (calcErr)
err(y, x) = numeric_limits<float>::max();
return;
}
D = 1.f / D;
A11 *= D;
A12 *= D;
A22 *= D;
float2 nextPt;
nextPt.x = x + prevU(y/2, x/2) * 2.0f;
nextPt.y = y + prevV(y/2, x/2) * 2.0f;
for (int k = 0; k < c_iters; ++k)
{
if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
{
if (calcErr)
err(y, x) = numeric_limits<float>::max();
return;
}
int b1 = 0;
int b2 = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
for (int j = 0; j < c_winSize_x; ++j)
{
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
int diff = (J - I) * 32;
int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
b1 += diff * dIdx;
b2 += diff * dIdy;
}
}
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
break;
}
u(y, x) = nextPt.x - x;
v(y, x) = nextPt.y - y;
if (calcErr)
{
int errval = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
for (int j = 0; j < c_winSize_x; ++j)
{
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
errval += ::abs(J - I);
}
}
err(y, x) = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);
}
}
void loadConstants(int2 winSize, int iters)
{
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
}
void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
static const func_t funcs[5][5] =
{
{sparse_caller<1, 1, 1>, sparse_caller<1, 2, 1>, sparse_caller<1, 3, 1>, sparse_caller<1, 4, 1>, sparse_caller<1, 5, 1>},
{sparse_caller<1, 1, 2>, sparse_caller<1, 2, 2>, sparse_caller<1, 3, 2>, sparse_caller<1, 4, 2>, sparse_caller<1, 5, 2>},
{sparse_caller<1, 1, 3>, sparse_caller<1, 2, 3>, sparse_caller<1, 3, 3>, sparse_caller<1, 4, 3>, sparse_caller<1, 5, 3>},
{sparse_caller<1, 1, 4>, sparse_caller<1, 2, 4>, sparse_caller<1, 3, 4>, sparse_caller<1, 4, 4>, sparse_caller<1, 5, 4>},
{sparse_caller<1, 1, 5>, sparse_caller<1, 2, 5>, sparse_caller<1, 3, 5>, sparse_caller<1, 4, 5>, sparse_caller<1, 5, 5>}
};
bindTexture(&tex_If, I);
bindTexture(&tex_Jf, J);
funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
static const func_t funcs[5][5] =
{
{sparse_caller<4, 1, 1>, sparse_caller<4, 2, 1>, sparse_caller<4, 3, 1>, sparse_caller<4, 4, 1>, sparse_caller<4, 5, 1>},
{sparse_caller<4, 1, 2>, sparse_caller<4, 2, 2>, sparse_caller<4, 3, 2>, sparse_caller<4, 4, 2>, sparse_caller<4, 5, 2>},
{sparse_caller<4, 1, 3>, sparse_caller<4, 2, 3>, sparse_caller<4, 3, 3>, sparse_caller<4, 4, 3>, sparse_caller<4, 5, 3>},
{sparse_caller<4, 1, 4>, sparse_caller<4, 2, 4>, sparse_caller<4, 3, 4>, sparse_caller<4, 4, 4>, sparse_caller<4, 5, 4>},
{sparse_caller<4, 1, 5>, sparse_caller<4, 2, 5>, sparse_caller<4, 3, 5>, sparse_caller<4, 4, 5>, sparse_caller<4, 5, 5>}
};
bindTexture(&tex_If4, I);
bindTexture(&tex_Jf4, J);
funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
bindTexture(&tex_Ib, I);
bindTexture(&tex_Jf, J);
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
const int patchWidth = block.x + 2 * halfWin.x;
const int patchHeight = block.y + 2 * halfWin.y;
size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);
if (err.data)
{
denseKernel<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);
cudaSafeCall( cudaGetLastError() );
}
else
{
denseKernel<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,332 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/limits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
////////////////////////////////////////////////////////////
// centeredGradient
namespace tvl1flow
{
__global__ void centeredGradientKernel(const PtrStepSzf src, PtrStepf dx, PtrStepf dy)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= src.cols || y >= src.rows)
return;
dx(y, x) = 0.5f * (src(y, ::min(x + 1, src.cols - 1)) - src(y, ::max(x - 1, 0)));
dy(y, x) = 0.5f * (src(::min(y + 1, src.rows - 1), x) - src(::max(y - 1, 0), x));
}
void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
centeredGradientKernel<<<grid, block>>>(src, dx, dy);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
////////////////////////////////////////////////////////////
// warpBackward
namespace tvl1flow
{
static __device__ __forceinline__ float bicubicCoeff(float x_)
{
float x = fabsf(x_);
if (x <= 1.0f)
{
return x * x * (1.5f * x - 2.5f) + 1.0f;
}
else if (x < 2.0f)
{
return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
}
else
{
return 0.0f;
}
}
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void warpBackwardKernel(const PtrStepSzf I0, const PtrStepf u1, const PtrStepf u2, PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= I0.cols || y >= I0.rows)
return;
const float u1Val = u1(y, x);
const float u2Val = u2(y, x);
const float wx = x + u1Val;
const float wy = y + u2Val;
const int xmin = ::ceilf(wx - 2.0f);
const int xmax = ::floorf(wx + 2.0f);
const int ymin = ::ceilf(wy - 2.0f);
const int ymax = ::floorf(wy + 2.0f);
float sum = 0.0f;
float sumx = 0.0f;
float sumy = 0.0f;
float wsum = 0.0f;
for (int cy = ymin; cy <= ymax; ++cy)
{
for (int cx = xmin; cx <= xmax; ++cx)
{
const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
sum += w * tex2D(tex_I1 , cx, cy);
sumx += w * tex2D(tex_I1x, cx, cy);
sumy += w * tex2D(tex_I1y, cx, cy);
wsum += w;
}
}
const float coeff = 1.0f / wsum;
const float I1wVal = sum * coeff;
const float I1wxVal = sumx * coeff;
const float I1wyVal = sumy * coeff;
I1w(y, x) = I1wVal;
I1wx(y, x) = I1wxVal;
I1wy(y, x) = I1wyVal;
const float Ix2 = I1wxVal * I1wxVal;
const float Iy2 = I1wyVal * I1wyVal;
// store the |Grad(I1)|^2
grad(y, x) = Ix2 + Iy2;
// compute the constant part of the rho function
const float I0Val = I0(y, x);
rho(y, x) = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
}
void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho)
{
const dim3 block(32, 8);
const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
bindTexture(&tex_I1 , I1);
bindTexture(&tex_I1x, I1x);
bindTexture(&tex_I1y, I1y);
warpBackwardKernel<<<grid, block>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
////////////////////////////////////////////////////////////
// estimateU
namespace tvl1flow
{
__device__ float divergence(const PtrStepf& v1, const PtrStepf& v2, int y, int x)
{
if (x > 0 && y > 0)
{
const float v1x = v1(y, x) - v1(y, x - 1);
const float v2y = v2(y, x) - v2(y - 1, x);
return v1x + v2y;
}
else
{
if (y > 0)
return v1(y, 0) + v2(y, 0) - v2(y - 1, 0);
else
{
if (x > 0)
return v1(0, x) - v1(0, x - 1) + v2(0, x);
else
return v1(0, 0) + v2(0, 0);
}
}
}
__global__ void estimateUKernel(const PtrStepSzf I1wx, const PtrStepf I1wy,
const PtrStepf grad, const PtrStepf rho_c,
const PtrStepf p11, const PtrStepf p12, const PtrStepf p21, const PtrStepf p22,
PtrStepf u1, PtrStepf u2, PtrStepf error,
const float l_t, const float theta)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= I1wx.cols || y >= I1wx.rows)
return;
const float I1wxVal = I1wx(y, x);
const float I1wyVal = I1wy(y, x);
const float gradVal = grad(y, x);
const float u1OldVal = u1(y, x);
const float u2OldVal = u2(y, x);
const float rho = rho_c(y, x) + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);
// estimate the values of the variable (v1, v2) (thresholding operator TH)
float d1 = 0.0f;
float d2 = 0.0f;
if (rho < -l_t * gradVal)
{
d1 = l_t * I1wxVal;
d2 = l_t * I1wyVal;
}
else if (rho > l_t * gradVal)
{
d1 = -l_t * I1wxVal;
d2 = -l_t * I1wyVal;
}
else if (gradVal > numeric_limits<float>::epsilon())
{
const float fi = -rho / gradVal;
d1 = fi * I1wxVal;
d2 = fi * I1wyVal;
}
const float v1 = u1OldVal + d1;
const float v2 = u2OldVal + d2;
// compute the divergence of the dual variable (p1, p2)
const float div_p1 = divergence(p11, p12, y, x);
const float div_p2 = divergence(p21, p22, y, x);
// estimate the values of the optical flow (u1, u2)
const float u1NewVal = v1 + theta * div_p1;
const float u2NewVal = v2 + theta * div_p2;
u1(y, x) = u1NewVal;
u2(y, x) = u2NewVal;
const float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
const float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
error(y, x) = n1 + n2;
}
void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
PtrStepSzf grad, PtrStepSzf rho_c,
PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22,
PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error,
float l_t, float theta)
{
const dim3 block(32, 8);
const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y));
estimateUKernel<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, error, l_t, theta);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
////////////////////////////////////////////////////////////
// estimateDualVariables
namespace tvl1flow
{
__global__ void estimateDualVariablesKernel(const PtrStepSzf u1, const PtrStepf u2, PtrStepf p11, PtrStepf p12, PtrStepf p21, PtrStepf p22, const float taut)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= u1.cols || y >= u1.rows)
return;
const float u1x = u1(y, ::min(x + 1, u1.cols - 1)) - u1(y, x);
const float u1y = u1(::min(y + 1, u1.rows - 1), x) - u1(y, x);
const float u2x = u2(y, ::min(x + 1, u1.cols - 1)) - u2(y, x);
const float u2y = u2(::min(y + 1, u1.rows - 1), x) - u2(y, x);
const float g1 = ::hypotf(u1x, u1y);
const float g2 = ::hypotf(u2x, u2y);
const float ng1 = 1.0f + taut * g1;
const float ng2 = 1.0f + taut * g2;
p11(y, x) = (p11(y, x) + taut * u1x) / ng1;
p12(y, x) = (p12(y, x) + taut * u1y) / ng1;
p21(y, x) = (p21(y, x) + taut * u2x) / ng2;
p22(y, x) = (p22(y, x) + taut * u2y) / ng2;
}
void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, float taut)
{
const dim3 block(32, 8);
const dim3 grid(divUp(u1.cols, block.x), divUp(u1.rows, block.y));
estimateDualVariablesKernel<<<grid, block>>>(u1, u2, p11, p12, p21, p22, taut);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#endif // !defined CUDA_DISABLER