removed BEGIN_OPENCV_DEVICE_NAMESPACE macros

This commit is contained in:
Vladislav Vinogradov
2011-11-14 09:02:06 +00:00
parent d926541311
commit 0f53f2993e
73 changed files with 19272 additions and 19504 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -45,423 +45,421 @@
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace bf_radius_match {
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled
template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
namespace cv { namespace gpu { namespace device
{
#if __CUDA_ARCH__ >= 110
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Dist dist;
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
namespace bf_radius_match
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
T val;
#if __CUDA_ARCH__ >= 110
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
extern __shared__ int smem[];
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Dist dist;
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
float distVal = (typename Dist::result_type)dist;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
{
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (ind < maxCount)
{
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
#endif
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
float distVal = (typename Dist::result_type)dist;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
{
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (ind < maxCount)
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)
{
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
#endif
}
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
cudaSafeCall( cudaGetLastError() );
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
for (int i = 0; i < n; ++i)
{
const DevMem2D_<T> train = trains[i];
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
if (masks != 0 && masks[i].data)
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
cudaStream_t stream)
{
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
else
{
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
cudaSafeCall( cudaGetLastError() );
}
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
///////////////////////////////////////////////////////////////////////////////
// Match
for (int i = 0; i < n; ++i)
{
const DevMem2D_<T> train = trains[i];
template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if __CUDA_ARCH__ >= 110
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
extern __shared__ int smem[];
if (masks != 0 && masks[i].data)
{
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
else
{
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
cudaSafeCall( cudaGetLastError() );
}
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Dist dist;
for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__syncthreads();
///////////////////////////////////////////////////////////////////////////////
// Match
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
float distVal = (typename Dist::result_type)dist;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
{
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (ind < maxCount)
template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
bestDistance.ptr(queryIdx)[ind] = distVal;
#if __CUDA_ARCH__ >= 110
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Dist dist;
for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
float distVal = (typename Dist::result_type)dist;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
{
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (ind < maxCount)
{
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
#endif
}
}
#endif
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, typename Dist, typename T>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
for (int i = 0; i < n; ++i)
{
const DevMem2D_<T> train = trains[i];
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
if (masks != 0 && masks[i].data)
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
cudaStream_t stream)
{
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
else
template <int BLOCK_SIZE, typename Dist, typename T>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
cudaStream_t stream)
{
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
for (int i = 0; i < n; ++i)
{
const DevMem2D_<T> train = trains[i];
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
if (masks != 0 && masks[i].data)
{
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
else
{
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match dispatcher
///////////////////////////////////////////////////////////////////////////////
// Match dispatcher
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 128)
{
matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}*/
else
{
match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
}
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 128)
{
matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}*/
else
{
match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
}
template <typename Dist, typename T>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 128)
{
matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}*/
else
{
match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
}
template <typename Dist, typename T>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 128)
{
matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}*/
else
{
match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match caller
///////////////////////////////////////////////////////////////////////////////
// Radius Match caller
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
}
}
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
}
}
template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
}
}
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
}
}
//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
}
}
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
}
}
template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
//template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
} // namespace bf_radius_match
END_OPENCV_DEVICE_NAMESPACE
template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
} // namespace bf_radius_match
}}} // namespace cv { namespace gpu { namespace device

View File

@@ -43,186 +43,184 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace bilateral_filter {
__constant__ float* ctable_color;
__constant__ float* ctable_space;
__constant__ size_t ctable_space_step;
__constant__ int cndisp;
__constant__ int cradius;
__constant__ short cedge_disc;
__constant__ short cmax_disc;
void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)
namespace cv { namespace gpu { namespace device
{
cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
size_t table_space_step = table_space.step / sizeof(float);
cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );
}
template <int channels>
struct DistRgbMax
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
namespace bilateral_filter
{
uchar x = ::abs(a[0] - b[0]);
uchar y = ::abs(a[1] - b[1]);
uchar z = ::abs(a[2] - b[2]);
return (::max(::max(x, y), z));
}
};
__constant__ float* ctable_color;
__constant__ float* ctable_space;
__constant__ size_t ctable_space_step;
template <>
struct DistRgbMax<1>
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
return ::abs(a[0] - b[0]);
}
};
__constant__ int cndisp;
__constant__ int cradius;
template <int channels, typename T>
__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
__constant__ short cedge_disc;
__constant__ short cmax_disc;
T dp[5];
if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
{
dp[0] = *(disp + (y ) * disp_step + x + 0);
dp[1] = *(disp + (y-1) * disp_step + x + 0);
dp[2] = *(disp + (y ) * disp_step + x - 1);
dp[3] = *(disp + (y+1) * disp_step + x + 0);
dp[4] = *(disp + (y ) * disp_step + x + 1);
if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc)
void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
const int ymin = ::max(0, y - cradius);
const int xmin = ::max(0, x - cradius);
const int ymax = ::min(h - 1, y + cradius);
const int xmax = ::min(w - 1, x + cradius);
cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
size_t table_space_step = table_space.step / sizeof(float);
cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );
const uchar* ic = img + y * img_step + channels * x;
cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );
}
for(int yi = ymin; yi <= ymax; yi++)
template <int channels>
struct DistRgbMax
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
const T* disp_y = disp + yi * disp_step;
uchar x = ::abs(a[0] - b[0]);
uchar y = ::abs(a[1] - b[1]);
uchar z = ::abs(a[2] - b[2]);
return (::max(::max(x, y), z));
}
};
for(int xi = xmin; xi <= xmax; xi++)
template <>
struct DistRgbMax<1>
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
return ::abs(a[0] - b[0]);
}
};
template <int channels, typename T>
__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
T dp[5];
if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
{
dp[0] = *(disp + (y ) * disp_step + x + 0);
dp[1] = *(disp + (y-1) * disp_step + x + 0);
dp[2] = *(disp + (y ) * disp_step + x - 1);
dp[3] = *(disp + (y+1) * disp_step + x + 0);
dp[4] = *(disp + (y ) * disp_step + x + 1);
if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc)
{
const uchar* in = img + yi * img_step + channels * xi;
const int ymin = ::max(0, y - cradius);
const int xmin = ::max(0, x - cradius);
const int ymax = ::min(h - 1, y + cradius);
const int xmax = ::min(w - 1, x + cradius);
uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];
const uchar* ic = img + y * img_step + channels * x;
const T disp_reg = disp_y[xi];
for(int yi = ymin; yi <= ymax; yi++)
{
const T* disp_y = disp + yi * disp_step;
cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;
cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;
cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;
cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;
cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;
for(int xi = xmin; xi <= xmax; xi++)
{
const uchar* in = img + yi * img_step + channels * xi;
uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];
const T disp_reg = disp_y[xi];
cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;
cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;
cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;
cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;
cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;
}
}
float minimum = numeric_limits<float>::max();
int id = 0;
if (cost[0] < minimum)
{
minimum = cost[0];
id = 0;
}
if (cost[1] < minimum)
{
minimum = cost[1];
id = 1;
}
if (cost[2] < minimum)
{
minimum = cost[2];
id = 2;
}
if (cost[3] < minimum)
{
minimum = cost[3];
id = 3;
}
if (cost[4] < minimum)
{
minimum = cost[4];
id = 4;
}
*(disp + y * disp_step + x) = dp[id];
}
}
float minimum = numeric_limits<float>::max();
int id = 0;
if (cost[0] < minimum)
{
minimum = cost[0];
id = 0;
}
if (cost[1] < minimum)
{
minimum = cost[1];
id = 1;
}
if (cost[2] < minimum)
{
minimum = cost[2];
id = 2;
}
if (cost[3] < minimum)
{
minimum = cost[3];
id = 3;
}
if (cost[4] < minimum)
{
minimum = cost[4];
id = 4;
}
*(disp + y * disp_step + x) = dp[id];
}
}
}
template <typename T>
void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x << 1);
grid.y = divUp(disp.rows, threads.y);
switch (channels)
{
case 1:
for (int i = 0; i < iters; ++i)
template <typename T>
void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x << 1);
grid.y = divUp(disp.rows, threads.y);
bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
switch (channels)
{
case 1:
for (int i = 0; i < iters; ++i)
{
bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
case 3:
for (int i = 0; i < iters; ++i)
{
bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
default:
cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
}
if (stream != 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
break;
case 3:
for (int i = 0; i < iters; ++i)
void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bilateral_filter_caller(disp, img, channels, iters, stream);
}
break;
default:
cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
}
if (stream != 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
} // namespace bilateral_filter
END_OPENCV_DEVICE_NAMESPACE
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
} // namespace bilateral_filter
}}} // namespace cv { namespace gpu { namespace device

View File

@@ -42,77 +42,75 @@
#include "internal_shared.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace blend {
template <typename T>
__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
namespace cv { namespace gpu { namespace device
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < rows && x < cols)
namespace blend
{
int x_ = x / cn;
float w1 = weights1.ptr(y)[x_];
float w2 = weights2.ptr(y)[x_];
T p1 = img1.ptr(y)[x];
T p2 = img2.ptr(y)[x];
result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
}
}
template <typename T>
__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (y < rows && x < cols)
{
int x_ = x / cn;
float w1 = weights1.ptr(y)[x_];
float w2 = weights2.ptr(y)[x_];
T p1 = img1.ptr(y)[x];
T p2 = img2.ptr(y)[x];
result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
}
}
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < rows && x < cols)
{
float w1 = weights1.ptr(y)[x];
float w2 = weights2.ptr(y)[x];
float sum_inv = 1.f / (w1 + w2 + 1e-5f);
w1 *= sum_inv;
w2 *= sum_inv;
uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];
uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];
((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
}
}
if (y < rows && x < cols)
{
float w1 = weights1.ptr(y)[x];
float w2 = weights2.ptr(y)[x];
float sum_inv = 1.f / (w1 + w2 + 1e-5f);
w1 *= sum_inv;
w2 *= sum_inv;
uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];
uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];
((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
}
}
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
} // namespace blend
END_OPENCV_DEVICE_NAMESPACE
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
} // namespace blend
}}} // namespace cv { namespace gpu { namespace device

View File

@@ -44,149 +44,148 @@
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
namespace transform_points
namespace cv { namespace gpu { namespace device
{
__constant__ float3 crot0;
__constant__ float3 crot1;
__constant__ float3 crot2;
__constant__ float3 ctransl;
#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
struct TransformOp : unary_function<float3, float3>
namespace transform_points
{
__device__ __forceinline__ float3 operator()(const float3& p) const
__constant__ float3 crot0;
__constant__ float3 crot1;
__constant__ float3 crot2;
__constant__ float3 ctransl;
struct TransformOp : unary_function<float3, float3>
{
return make_float3(
crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
__device__ __forceinline__ float3 operator()(const float3& p) const
{
return make_float3(
crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
}
};
void call(const DevMem2D_<float3> src, const float* rot,
const float* transl, DevMem2D_<float3> dst,
cudaStream_t stream)
{
cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
::cv::gpu::device::transform(src, dst, TransformOp(), stream);
}
};
} // namespace transform_points
void call(const DevMem2D_<float3> src, const float* rot,
const float* transl, DevMem2D_<float3> dst,
cudaStream_t stream)
namespace project_points
{
cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
OPENCV_DEVICE_NAMESPACE_ transform(src, dst, TransformOp(), stream);
}
} // namespace transform_points
__constant__ float3 crot0;
__constant__ float3 crot1;
__constant__ float3 crot2;
__constant__ float3 ctransl;
__constant__ float3 cproj0;
__constant__ float3 cproj1;
namespace project_points
{
__constant__ float3 crot0;
__constant__ float3 crot1;
__constant__ float3 crot2;
__constant__ float3 ctransl;
__constant__ float3 cproj0;
__constant__ float3 cproj1;
struct ProjectOp : unary_function<float3, float3>
{
__device__ __forceinline__ float2 operator()(const float3& p) const
struct ProjectOp : unary_function<float3, float3>
{
// Rotate and translate in 3D
float3 t = make_float3(
crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
// Project on 2D plane
return make_float2(
(cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
(cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
__device__ __forceinline__ float2 operator()(const float3& p) const
{
// Rotate and translate in 3D
float3 t = make_float3(
crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
// Project on 2D plane
return make_float2(
(cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
(cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
}
};
void call(const DevMem2D_<float3> src, const float* rot,
const float* transl, const float* proj, DevMem2D_<float2> dst,
cudaStream_t stream)
{
cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
::cv::gpu::device::transform(src, dst, ProjectOp(), stream);
}
};
} // namespace project_points
void call(const DevMem2D_<float3> src, const float* rot,
const float* transl, const float* proj, DevMem2D_<float2> dst,
cudaStream_t stream)
namespace solve_pnp_ransac
{
cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
OPENCV_DEVICE_NAMESPACE_ transform(src, dst, ProjectOp(), stream);
}
} // namespace project_points
__constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
__constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
namespace solve_pnp_ransac
{
__constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
__constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
int maxNumIters()
{
return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
}
__device__ __forceinline__ float sqr(float x)
{
return x * x;
}
__global__ void computeHypothesisScoresKernel(
const int num_points, const float3* object, const float2* image,
const float dist_threshold, int* g_num_inliers)
{
const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
const float3 &transl_vec = ctransl_vectors[blockIdx.x];
int num_inliers = 0;
for (int i = threadIdx.x; i < num_points; i += blockDim.x)
int maxNumIters()
{
float3 p = object[i];
p = make_float3(
rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
p.x /= p.z;
p.y /= p.z;
float2 image_p = image[i];
if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
++num_inliers;
return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
}
extern __shared__ float s_num_inliers[];
s_num_inliers[threadIdx.x] = num_inliers;
__syncthreads();
for (int step = blockDim.x / 2; step > 0; step >>= 1)
__device__ __forceinline__ float sqr(float x)
{
if (threadIdx.x < step)
s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
return x * x;
}
__global__ void computeHypothesisScoresKernel(
const int num_points, const float3* object, const float2* image,
const float dist_threshold, int* g_num_inliers)
{
const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
const float3 &transl_vec = ctransl_vectors[blockIdx.x];
int num_inliers = 0;
for (int i = threadIdx.x; i < num_points; i += blockDim.x)
{
float3 p = object[i];
p = make_float3(
rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
p.x /= p.z;
p.y /= p.z;
float2 image_p = image[i];
if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
++num_inliers;
}
extern __shared__ float s_num_inliers[];
s_num_inliers[threadIdx.x] = num_inliers;
__syncthreads();
for (int step = blockDim.x / 2; step > 0; step >>= 1)
{
if (threadIdx.x < step)
s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
__syncthreads();
}
if (threadIdx.x == 0)
g_num_inliers[blockIdx.x] = s_num_inliers[0];
}
if (threadIdx.x == 0)
g_num_inliers[blockIdx.x] = s_num_inliers[0];
}
void computeHypothesisScores(
const int num_hypotheses, const int num_points, const float* rot_matrices,
const float3* transl_vectors, const float3* object, const float2* image,
const float dist_threshold, int* hypothesis_scores)
{
cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
void computeHypothesisScores(
const int num_hypotheses, const int num_points, const float* rot_matrices,
const float3* transl_vectors, const float3* object, const float2* image,
const float dist_threshold, int* hypothesis_scores)
{
cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
dim3 threads(256);
dim3 grid(num_hypotheses);
int smem_size = threads.x * sizeof(float);
dim3 threads(256);
dim3 grid(num_hypotheses);
int smem_size = threads.x * sizeof(float);
computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
num_points, object, image, dist_threshold, hypothesis_scores);
cudaSafeCall( cudaGetLastError() );
computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
num_points, object, image, dist_threshold, hypothesis_scores);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
} // namespace solvepnp_ransac
END_OPENCV_DEVICE_NAMESPACE
cudaSafeCall( cudaDeviceSynchronize() );
}
} // namespace solvepnp_ransac
}}} // namespace cv { namespace gpu { namespace device

View File

@@ -44,450 +44,448 @@
#include <algorithm>
#include "internal_shared.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace canny {
__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
namespace cv { namespace gpu { namespace device
{
__shared__ int smem[16][18];
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i < rows)
namespace canny
{
smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
if (threadIdx.x == 0)
__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
{
smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
}
__syncthreads();
__shared__ int smem[16][18];
if (j < cols)
{
dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
}
}
}
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
struct L1
{
static __device__ __forceinline__ float calc(int x, int y)
{
return ::abs(x) + ::abs(y);
}
};
struct L2
{
static __device__ __forceinline__ float calc(int x, int y)
{
return ::sqrtf(x * x + y * y);
}
};
template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{
__shared__ int sdx[18][16];
__shared__ int sdy[18][16];
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (j < cols)
{
sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
if (threadIdx.y == 0)
{
sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
}
__syncthreads();
if (i < rows)
{
int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
dx.ptr(i)[j] = x;
dy.ptr(i)[j] = y;
mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
}
}
}
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
if (L2Grad)
calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
else
calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i < rows && j < cols)
mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
}
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
if (L2Grad)
calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
else
calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
//////////////////////////////////////////////////////////////////////////////////////////
#define CANNY_SHIFT 15
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
{
__shared__ float smem[18][18];
const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y;
const int tid = threadIdx.y * 16 + threadIdx.x;
const int lx = tid % 18;
const int ly = tid / 18;
if (ly < 14)
smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
__syncthreads();
if (i < rows && j < cols)
{
int x = dx.ptr(i)[j];
int y = dy.ptr(i)[j];
const int s = (x ^ y) < 0 ? -1 : 1;
const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
x = ::abs(x);
y = ::abs(y);
// 0 - the pixel can not belong to an edge
// 1 - the pixel might belong to an edge
// 2 - the pixel does belong to an edge
int edge_type = 0;
if (m > low_thresh)
{
const int tg22x = x * TG22;
const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);
y <<= CANNY_SHIFT;
if (y < tg22x)
if (i < rows)
{
if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
edge_type = 1 + (int)(m > high_thresh);
}
else if( y > tg67x )
{
if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
edge_type = 1 + (int)(m > high_thresh);
}
else
{
if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
edge_type = 1 + (int)(m > high_thresh);
}
}
map.ptr(i + 1)[j + 1] = edge_type;
}
}
#undef CANNY_SHIFT
#undef TG22
void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
//////////////////////////////////////////////////////////////////////////////////////////
__device__ unsigned int counter = 0;
__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
{
#if __CUDA_ARCH__ >= 120
__shared__ int smem[18][18];
const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y;
const int tid = threadIdx.y * 16 + threadIdx.x;
const int lx = tid % 18;
const int ly = tid / 18;
if (ly < 14)
smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
__syncthreads();
if (i < rows && j < cols)
{
int n;
#pragma unroll
for (int k = 0; k < 16; ++k)
{
n = 0;
if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
{
n += smem[threadIdx.y ][threadIdx.x ] == 2;
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
}
if (n > 0)
smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
}
const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
map.ptr(i + 1)[j + 1] = e;
n = 0;
if (e == 2)
{
n += smem[threadIdx.y ][threadIdx.x ] == 1;
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
}
if (n > 0)
{
const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
st[ind] = make_ushort2(j + 1, i + 1);
}
}
#endif
}
void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
{
#if __CUDA_ARCH__ >= 120
const int stack_size = 512;
__shared__ unsigned int s_counter;
__shared__ unsigned int s_ind;
__shared__ ushort2 s_st[stack_size];
if (threadIdx.x == 0)
s_counter = 0;
__syncthreads();
int ind = blockIdx.y * gridDim.x + blockIdx.x;
if (ind < count)
{
ushort2 pos = st1[ind];
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
{
if (threadIdx.x < 8)
{
pos.x += c_dx[threadIdx.x];
pos.y += c_dy[threadIdx.x];
if (map.ptr(pos.y)[pos.x] == 1)
smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
if (threadIdx.x == 0)
{
map.ptr(pos.y)[pos.x] = 2;
smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
}
__syncthreads();
ind = atomicInc(&s_counter, (unsigned int)(-1));
s_st[ind] = pos;
if (j < cols)
{
dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
}
}
}
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
struct L1
{
static __device__ __forceinline__ float calc(int x, int y)
{
return ::abs(x) + ::abs(y);
}
};
struct L2
{
static __device__ __forceinline__ float calc(int x, int y)
{
return ::sqrtf(x * x + y * y);
}
};
template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{
__shared__ int sdx[18][16];
__shared__ int sdy[18][16];
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (j < cols)
{
sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
if (threadIdx.y == 0)
{
sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
}
__syncthreads();
if (i < rows)
{
int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
dx.ptr(i)[j] = x;
dy.ptr(i)[j] = y;
mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
}
}
}
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
if (L2Grad)
calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
else
calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i < rows && j < cols)
mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
}
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
if (L2Grad)
calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
else
calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
//////////////////////////////////////////////////////////////////////////////////////////
#define CANNY_SHIFT 15
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
{
__shared__ float smem[18][18];
const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y;
const int tid = threadIdx.y * 16 + threadIdx.x;
const int lx = tid % 18;
const int ly = tid / 18;
if (ly < 14)
smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
__syncthreads();
while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
if (i < rows && j < cols)
{
const int subTaskIdx = threadIdx.x >> 3;
const int portion = ::min(s_counter, blockDim.x >> 3);
int x = dx.ptr(i)[j];
int y = dy.ptr(i)[j];
const int s = (x ^ y) < 0 ? -1 : 1;
const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
pos.x = pos.y = 0;
x = ::abs(x);
y = ::abs(y);
if (subTaskIdx < portion)
pos = s_st[s_counter - 1 - subTaskIdx];
__syncthreads();
if (threadIdx.x == 0)
s_counter -= portion;
__syncthreads();
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
// 0 - the pixel can not belong to an edge
// 1 - the pixel might belong to an edge
// 2 - the pixel does belong to an edge
int edge_type = 0;
if (m > low_thresh)
{
pos.x += c_dx[threadIdx.x & 7];
pos.y += c_dy[threadIdx.x & 7];
const int tg22x = x * TG22;
const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);
if (map.ptr(pos.y)[pos.x] == 1)
y <<= CANNY_SHIFT;
if (y < tg22x)
{
map.ptr(pos.y)[pos.x] = 2;
ind = atomicInc(&s_counter, (unsigned int)(-1));
s_st[ind] = pos;
if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
edge_type = 1 + (int)(m > high_thresh);
}
else if( y > tg67x )
{
if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
edge_type = 1 + (int)(m > high_thresh);
}
else
{
if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
edge_type = 1 + (int)(m > high_thresh);
}
}
__syncthreads();
}
if (s_counter > 0)
{
if (threadIdx.x == 0)
{
ind = atomicAdd(&counter, s_counter);
s_ind = ind - s_counter;
}
__syncthreads();
ind = s_ind;
for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
{
st2[ind + i] = s_st[i];
}
map.ptr(i + 1)[j + 1] = edge_type;
}
}
}
#endif
}
#undef CANNY_SHIFT
#undef TG22
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
unsigned int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
while (count > 0)
{
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
cudaSafeCall( cudaGetLastError() );
dim3 block(128, 1, 1);
dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);
edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
cudaSafeCall(cudaThreadSynchronize());
//////////////////////////////////////////////////////////////////////////////////////////
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
__device__ unsigned int counter = 0;
std::swap(st1, st2);
}
}
__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
{
#if __CUDA_ARCH__ >= 120
__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
{
const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y;
__shared__ int smem[18][18];
if (i < rows && j < cols)
dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
}
const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y;
void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
const int tid = threadIdx.y * 16 + threadIdx.x;
const int lx = tid % 18;
const int ly = tid / 18;
getEdges<<<grid, block>>>(map, dst, rows, cols);
cudaSafeCall( cudaGetLastError() );
if (ly < 14)
smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
cudaSafeCall(cudaThreadSynchronize());
}
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
} // namespace canny
__syncthreads();
END_OPENCV_DEVICE_NAMESPACE
if (i < rows && j < cols)
{
int n;
#pragma unroll
for (int k = 0; k < 16; ++k)
{
n = 0;
if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
{
n += smem[threadIdx.y ][threadIdx.x ] == 2;
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
}
if (n > 0)
smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
}
const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
map.ptr(i + 1)[j + 1] = e;
n = 0;
if (e == 2)
{
n += smem[threadIdx.y ][threadIdx.x ] == 1;
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
}
if (n > 0)
{
const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
st[ind] = make_ushort2(j + 1, i + 1);
}
}
#endif
}
void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
{
#if __CUDA_ARCH__ >= 120
const int stack_size = 512;
__shared__ unsigned int s_counter;
__shared__ unsigned int s_ind;
__shared__ ushort2 s_st[stack_size];
if (threadIdx.x == 0)
s_counter = 0;
__syncthreads();
int ind = blockIdx.y * gridDim.x + blockIdx.x;
if (ind < count)
{
ushort2 pos = st1[ind];
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
{
if (threadIdx.x < 8)
{
pos.x += c_dx[threadIdx.x];
pos.y += c_dy[threadIdx.x];
if (map.ptr(pos.y)[pos.x] == 1)
{
map.ptr(pos.y)[pos.x] = 2;
ind = atomicInc(&s_counter, (unsigned int)(-1));
s_st[ind] = pos;
}
}
__syncthreads();
while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
{
const int subTaskIdx = threadIdx.x >> 3;
const int portion = ::min(s_counter, blockDim.x >> 3);
pos.x = pos.y = 0;
if (subTaskIdx < portion)
pos = s_st[s_counter - 1 - subTaskIdx];
__syncthreads();
if (threadIdx.x == 0)
s_counter -= portion;
__syncthreads();
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
{
pos.x += c_dx[threadIdx.x & 7];
pos.y += c_dy[threadIdx.x & 7];
if (map.ptr(pos.y)[pos.x] == 1)
{
map.ptr(pos.y)[pos.x] = 2;
ind = atomicInc(&s_counter, (unsigned int)(-1));
s_st[ind] = pos;
}
}
__syncthreads();
}
if (s_counter > 0)
{
if (threadIdx.x == 0)
{
ind = atomicAdd(&counter, s_counter);
s_ind = ind - s_counter;
}
__syncthreads();
ind = s_ind;
for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
{
st2[ind + i] = s_st[i];
}
}
}
}
#endif
}
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
unsigned int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
while (count > 0)
{
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
dim3 block(128, 1, 1);
dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);
edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
std::swap(st1, st2);
}
}
__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
{
const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y;
if (i < rows && j < cols)
dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
}
void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
getEdges<<<grid, block>>>(map, dst, rows, cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
} // namespace canny
}}} // namespace cv { namespace gpu { namespace device

View File

@@ -44,181 +44,181 @@
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/color.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace cv { namespace gpu { namespace device
{
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_x = 8 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_x = 8 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
{
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
{
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) \
@@ -226,7 +226,7 @@ DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
traits::functor_type functor = traits::create_functor(); \
typedef typename traits::functor_type::argument_type src_t; \
typedef typename traits::functor_type::result_type dst_t; \
OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream); \
::cv::gpu::device::transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream); \
}
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
@@ -243,138 +243,137 @@ DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
END_OPENCV_DEVICE_NAMESPACE
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
}}} // namespace cv { namespace gpu { namespace device

View File

@@ -47,203 +47,201 @@
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 4
#define RESULT_STEPS 8
#define HALO_STEPS 1
namespace column_filter {
__constant__ float c_kernel[MAX_KERNEL_SIZE];
void loadKernel(const float kernel[], int ksize)
namespace cv { namespace gpu { namespace device
{
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
}
#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 4
#define RESULT_STEPS 8
#define HALO_STEPS 1
template <int KERNEL_SIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
__shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];
//Offset to the upper halo edge
const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y;
if (x < src.cols)
namespace column_filter
{
const T* src_col = src.ptr() + x;
__constant__ float c_kernel[MAX_KERNEL_SIZE];
//Main data
#pragma unroll
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
//Upper halo
#pragma unroll
for(int i = 0; i < HALO_STEPS; ++i)
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step);
//Lower halo
#pragma unroll
for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
__syncthreads();
#pragma unroll
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
void loadKernel(const float kernel[], int ksize)
{
sum_t sum = VecTraits<sum_t>::all(0);
#pragma unroll
for(int j = 0; j < KERNEL_SIZE; ++j)
sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j];
int dstY = y + i * BLOCK_DIM_Y;
if (dstY < src.rows)
dst.ptr(dstY)[x] = saturate_cast<D>(sum);
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
}
}
}
template <int ksize, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y));
template <int KERNEL_SIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
B<T> b(src.rows);
__shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];
linearColumnFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() );
//Offset to the upper halo edge
const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y;
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
if (x < src.cols)
{
const T* src_col = src.ptr() + x;
template <typename T, typename D>
void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
static const caller_t callers[5][17] =
{
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
linearColumnFilter_caller<10, T, D, BrdColReflect101>,
linearColumnFilter_caller<11, T, D, BrdColReflect101>,
linearColumnFilter_caller<12, T, D, BrdColReflect101>,
linearColumnFilter_caller<13, T, D, BrdColReflect101>,
linearColumnFilter_caller<14, T, D, BrdColReflect101>,
linearColumnFilter_caller<15, T, D, BrdColReflect101>,
linearColumnFilter_caller<16, T, D, BrdColReflect101>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
linearColumnFilter_caller<10, T, D, BrdColReplicate>,
linearColumnFilter_caller<11, T, D, BrdColReplicate>,
linearColumnFilter_caller<12, T, D, BrdColReplicate>,
linearColumnFilter_caller<13, T, D, BrdColReplicate>,
linearColumnFilter_caller<14, T, D, BrdColReplicate>,
linearColumnFilter_caller<15, T, D, BrdColReplicate>,
linearColumnFilter_caller<16, T, D, BrdColReplicate>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColConstant>,
linearColumnFilter_caller<2 , T, D, BrdColConstant>,
linearColumnFilter_caller<3 , T, D, BrdColConstant>,
linearColumnFilter_caller<4 , T, D, BrdColConstant>,
linearColumnFilter_caller<5 , T, D, BrdColConstant>,
linearColumnFilter_caller<6 , T, D, BrdColConstant>,
linearColumnFilter_caller<7 , T, D, BrdColConstant>,
linearColumnFilter_caller<8 , T, D, BrdColConstant>,
linearColumnFilter_caller<9 , T, D, BrdColConstant>,
linearColumnFilter_caller<10, T, D, BrdColConstant>,
linearColumnFilter_caller<11, T, D, BrdColConstant>,
linearColumnFilter_caller<12, T, D, BrdColConstant>,
linearColumnFilter_caller<13, T, D, BrdColConstant>,
linearColumnFilter_caller<14, T, D, BrdColConstant>,
linearColumnFilter_caller<15, T, D, BrdColConstant>,
linearColumnFilter_caller<16, T, D, BrdColConstant>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReflect>,
linearColumnFilter_caller<2 , T, D, BrdColReflect>,
linearColumnFilter_caller<3 , T, D, BrdColReflect>,
linearColumnFilter_caller<4 , T, D, BrdColReflect>,
linearColumnFilter_caller<5 , T, D, BrdColReflect>,
linearColumnFilter_caller<6 , T, D, BrdColReflect>,
linearColumnFilter_caller<7 , T, D, BrdColReflect>,
linearColumnFilter_caller<8 , T, D, BrdColReflect>,
linearColumnFilter_caller<9 , T, D, BrdColReflect>,
linearColumnFilter_caller<10, T, D, BrdColReflect>,
linearColumnFilter_caller<11, T, D, BrdColReflect>,
linearColumnFilter_caller<12, T, D, BrdColReflect>,
linearColumnFilter_caller<13, T, D, BrdColReflect>,
linearColumnFilter_caller<14, T, D, BrdColReflect>,
linearColumnFilter_caller<15, T, D, BrdColReflect>,
linearColumnFilter_caller<16, T, D, BrdColReflect>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColWrap>,
linearColumnFilter_caller<2 , T, D, BrdColWrap>,
linearColumnFilter_caller<3 , T, D, BrdColWrap>,
linearColumnFilter_caller<4 , T, D, BrdColWrap>,
linearColumnFilter_caller<5 , T, D, BrdColWrap>,
linearColumnFilter_caller<6 , T, D, BrdColWrap>,
linearColumnFilter_caller<7 , T, D, BrdColWrap>,
linearColumnFilter_caller<8 , T, D, BrdColWrap>,
linearColumnFilter_caller<9 , T, D, BrdColWrap>,
linearColumnFilter_caller<10, T, D, BrdColWrap>,
linearColumnFilter_caller<11, T, D, BrdColWrap>,
linearColumnFilter_caller<12, T, D, BrdColWrap>,
linearColumnFilter_caller<13, T, D, BrdColWrap>,
linearColumnFilter_caller<14, T, D, BrdColWrap>,
linearColumnFilter_caller<15, T, D, BrdColWrap>,
linearColumnFilter_caller<16, T, D, BrdColWrap>,
//Main data
#pragma unroll
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
//Upper halo
#pragma unroll
for(int i = 0; i < HALO_STEPS; ++i)
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step);
//Lower halo
#pragma unroll
for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
__syncthreads();
#pragma unroll
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
{
sum_t sum = VecTraits<sum_t>::all(0);
#pragma unroll
for(int j = 0; j < KERNEL_SIZE; ++j)
sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j];
int dstY = y + i * BLOCK_DIM_Y;
if (dstY < src.rows)
dst.ptr(dstY)[x] = saturate_cast<D>(sum);
}
}
}
};
loadKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
}
template <int ksize, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y));
template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
B<T> b(src.rows);
} // namespace column_filter
linearColumnFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() );
END_OPENCV_DEVICE_NAMESPACE
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename D>
void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
static const caller_t callers[5][17] =
{
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
linearColumnFilter_caller<10, T, D, BrdColReflect101>,
linearColumnFilter_caller<11, T, D, BrdColReflect101>,
linearColumnFilter_caller<12, T, D, BrdColReflect101>,
linearColumnFilter_caller<13, T, D, BrdColReflect101>,
linearColumnFilter_caller<14, T, D, BrdColReflect101>,
linearColumnFilter_caller<15, T, D, BrdColReflect101>,
linearColumnFilter_caller<16, T, D, BrdColReflect101>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
linearColumnFilter_caller<10, T, D, BrdColReplicate>,
linearColumnFilter_caller<11, T, D, BrdColReplicate>,
linearColumnFilter_caller<12, T, D, BrdColReplicate>,
linearColumnFilter_caller<13, T, D, BrdColReplicate>,
linearColumnFilter_caller<14, T, D, BrdColReplicate>,
linearColumnFilter_caller<15, T, D, BrdColReplicate>,
linearColumnFilter_caller<16, T, D, BrdColReplicate>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColConstant>,
linearColumnFilter_caller<2 , T, D, BrdColConstant>,
linearColumnFilter_caller<3 , T, D, BrdColConstant>,
linearColumnFilter_caller<4 , T, D, BrdColConstant>,
linearColumnFilter_caller<5 , T, D, BrdColConstant>,
linearColumnFilter_caller<6 , T, D, BrdColConstant>,
linearColumnFilter_caller<7 , T, D, BrdColConstant>,
linearColumnFilter_caller<8 , T, D, BrdColConstant>,
linearColumnFilter_caller<9 , T, D, BrdColConstant>,
linearColumnFilter_caller<10, T, D, BrdColConstant>,
linearColumnFilter_caller<11, T, D, BrdColConstant>,
linearColumnFilter_caller<12, T, D, BrdColConstant>,
linearColumnFilter_caller<13, T, D, BrdColConstant>,
linearColumnFilter_caller<14, T, D, BrdColConstant>,
linearColumnFilter_caller<15, T, D, BrdColConstant>,
linearColumnFilter_caller<16, T, D, BrdColConstant>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReflect>,
linearColumnFilter_caller<2 , T, D, BrdColReflect>,
linearColumnFilter_caller<3 , T, D, BrdColReflect>,
linearColumnFilter_caller<4 , T, D, BrdColReflect>,
linearColumnFilter_caller<5 , T, D, BrdColReflect>,
linearColumnFilter_caller<6 , T, D, BrdColReflect>,
linearColumnFilter_caller<7 , T, D, BrdColReflect>,
linearColumnFilter_caller<8 , T, D, BrdColReflect>,
linearColumnFilter_caller<9 , T, D, BrdColReflect>,
linearColumnFilter_caller<10, T, D, BrdColReflect>,
linearColumnFilter_caller<11, T, D, BrdColReflect>,
linearColumnFilter_caller<12, T, D, BrdColReflect>,
linearColumnFilter_caller<13, T, D, BrdColReflect>,
linearColumnFilter_caller<14, T, D, BrdColReflect>,
linearColumnFilter_caller<15, T, D, BrdColReflect>,
linearColumnFilter_caller<16, T, D, BrdColReflect>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColWrap>,
linearColumnFilter_caller<2 , T, D, BrdColWrap>,
linearColumnFilter_caller<3 , T, D, BrdColWrap>,
linearColumnFilter_caller<4 , T, D, BrdColWrap>,
linearColumnFilter_caller<5 , T, D, BrdColWrap>,
linearColumnFilter_caller<6 , T, D, BrdColWrap>,
linearColumnFilter_caller<7 , T, D, BrdColWrap>,
linearColumnFilter_caller<8 , T, D, BrdColWrap>,
linearColumnFilter_caller<9 , T, D, BrdColWrap>,
linearColumnFilter_caller<10, T, D, BrdColWrap>,
linearColumnFilter_caller<11, T, D, BrdColWrap>,
linearColumnFilter_caller<12, T, D, BrdColWrap>,
linearColumnFilter_caller<13, T, D, BrdColWrap>,
linearColumnFilter_caller<14, T, D, BrdColWrap>,
linearColumnFilter_caller<15, T, D, BrdColWrap>,
linearColumnFilter_caller<16, T, D, BrdColWrap>,
}
};
loadKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
}
template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
} // namespace column_filter
}}} // namespace cv { namespace gpu { namespace device

View File

@@ -43,87 +43,85 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc {
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)
namespace cv { namespace gpu { namespace device
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = src(y - top, x - left);
}
template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
{
static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left,
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);
copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode,
const T* borderValue, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type vec_type;
typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
static const caller_t callers[5] =
namespace imgproc
{
CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call,
CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
CopyMakeBorderDispatcher<BrdWrap, vec_type>::call
};
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream);
}
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = src(y - top, x - left);
}
template void copyMakeBorder_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
{
static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left,
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
//template void copyMakeBorder_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);
template void copyMakeBorder_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
cudaSafeCall( cudaGetLastError() );
template void copyMakeBorder_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
//template void copyMakeBorder_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode,
const T* borderValue, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type vec_type;
template void copyMakeBorder_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
} // namespace imgproc
static const caller_t callers[5] =
{
CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call,
CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
CopyMakeBorderDispatcher<BrdWrap, vec_type>::call
};
END_OPENCV_DEVICE_NAMESPACE
callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream);
}
template void copyMakeBorder_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device

File diff suppressed because it is too large Load Diff

View File

@@ -45,177 +45,175 @@
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace cv { namespace gpu { namespace device
{
#define UINT_BITS 32U
#define UINT_BITS 32U
//Warps == subhistograms per threadblock
#define WARP_COUNT 6
//Warps == subhistograms per threadblock
#define WARP_COUNT 6
//Threadblock size
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
#define HISTOGRAM256_BIN_COUNT 256
//Threadblock size
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
#define HISTOGRAM256_BIN_COUNT 256
//Shared memory per threadblock
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
//Shared memory per threadblock
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
#define PARTIAL_HISTOGRAM256_COUNT 240
#define PARTIAL_HISTOGRAM256_COUNT 240
#define MERGE_THREADBLOCK_SIZE 256
#define MERGE_THREADBLOCK_SIZE 256
#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)
#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)
namespace hist {
#if (!USE_SMEM_ATOMICS)
#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
__forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
namespace hist
{
uint count;
do
#if (!USE_SMEM_ATOMICS)
#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
__forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
{
uint count;
do
{
count = s_WarpHist[data] & TAG_MASK;
count = threadTag | (count + 1);
s_WarpHist[data] = count;
} while (s_WarpHist[data] != count);
}
#else
#define TAG_MASK 0xFFFFFFFFU
__forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
{
atomicAdd(s_WarpHist + data, 1);
}
#endif
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
{
count = s_WarpHist[data] & TAG_MASK;
count = threadTag | (count + 1);
s_WarpHist[data] = count;
} while (s_WarpHist[data] != count);
}
uint x = pos_x << 2;
#else
if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
}
#define TAG_MASK 0xFFFFFFFFU
__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
{
//Per-warp subhistogram storage
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
__forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
{
atomicAdd(s_WarpHist + data, 1);
}
//Clear shared memory storage for current threadblock before processing
#pragma unroll
for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
#endif
//Cycle through the entire data set, update subhistograms for each warp
const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
{
uint x = pos_x << 2;
__syncthreads();
const uint colsui = d_Data.step / sizeof(uint);
for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
{
uint pos_y = pos / colsui;
uint pos_x = pos % colsui;
uint data = d_Data.ptr(pos_y)[pos_x];
addWord(s_WarpHist, data, tag, pos_x, cols);
}
if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
}
//Merge per-warp histograms into per-block and write to global memory
__syncthreads();
for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
{
uint sum = 0;
__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
{
//Per-warp subhistogram storage
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
for (uint i = 0; i < WARP_COUNT; i++)
sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
//Clear shared memory storage for current threadblock before processing
#pragma unroll
for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
}
}
//Cycle through the entire data set, update subhistograms for each warp
const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
////////////////////////////////////////////////////////////////////////////////
// Merge histogram256() output
// Run one threadblock per bin; each threadblock adds up the same bin counter
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
// takes only a fraction of total processing time
////////////////////////////////////////////////////////////////////////////////
__syncthreads();
const uint colsui = d_Data.step / sizeof(uint);
for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
{
uint pos_y = pos / colsui;
uint pos_x = pos % colsui;
uint data = d_Data.ptr(pos_y)[pos_x];
addWord(s_WarpHist, data, tag, pos_x, cols);
}
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
{
uint sum = 0;
//Merge per-warp histograms into per-block and write to global memory
__syncthreads();
for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
{
uint sum = 0;
#pragma unroll
for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
for (uint i = 0; i < WARP_COUNT; i++)
sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
__shared__ uint data[MERGE_THREADBLOCK_SIZE];
data[threadIdx.x] = sum;
d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
}
}
for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
{
__syncthreads();
if(threadIdx.x < stride)
data[threadIdx.x] += data[threadIdx.x + stride];
}
////////////////////////////////////////////////////////////////////////////////
// Merge histogram256() output
// Run one threadblock per bin; each threadblock adds up the same bin counter
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
// takes only a fraction of total processing time
////////////////////////////////////////////////////////////////////////////////
if(threadIdx.x == 0)
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
}
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
{
uint sum = 0;
void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)
{
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
DevMem2D_<uint>(src),
buf,
static_cast<uint>(src.rows * src.step / sizeof(uint)),
src.cols);
#pragma unroll
for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
cudaSafeCall( cudaGetLastError() );
__shared__ uint data[MERGE_THREADBLOCK_SIZE];
data[threadIdx.x] = sum;
mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);
for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
{
__syncthreads();
if(threadIdx.x < stride)
data[threadIdx.x] += data[threadIdx.x + stride];
}
cudaSafeCall( cudaGetLastError() );
if(threadIdx.x == 0)
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)
{
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
DevMem2D_<uint>(src),
buf,
static_cast<uint>(src.rows * src.step / sizeof(uint)),
src.cols);
__constant__ int c_lut[256];
cudaSafeCall( cudaGetLastError() );
__global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);
if (x < src.cols && y < src.rows)
{
const uchar val = src.ptr(y)[x];
const int lut = c_lut[val];
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
}
}
cudaSafeCall( cudaGetLastError() );
void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
__constant__ int c_lut[256];
equalizeHist<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
__global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < src.cols && y < src.rows)
{
const uchar val = src.ptr(y)[x];
const int lut = c_lut[val];
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
}
}
void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
equalizeHist<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
} // namespace hist
END_OPENCV_DEVICE_NAMESPACE
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
} // namespace hist
}}} // namespace cv { namespace gpu { namespace device

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -50,7 +50,7 @@
#include "safe_call.hpp"
#ifndef CV_PI
#define CV_PI 3.1415926535897932384626433832795f
#define CV_PI 3.1415926535897932384626433832795
#endif
#ifndef CV_PI_F
@@ -61,27 +61,21 @@
#endif
#endif
#define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device {
#define END_OPENCV_DEVICE_NAMESPACE }}}
#define OPENCV_DEVICE_NAMESPACE ::cv::gpu::device
#define OPENCV_DEVICE_NAMESPACE_ ::cv::gpu::device::
#ifdef __CUDACC__
BEGIN_OPENCV_DEVICE_NAMESPACE
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef signed char schar;
typedef unsigned int uint;
template<class T> static inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)
namespace cv { namespace gpu { namespace device
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef signed char schar;
typedef unsigned int uint;
END_OPENCV_DEVICE_NAMESPACE
template<class T> static inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
}}}
#endif
@@ -102,87 +96,6 @@ namespace cv { namespace gpu
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
/*template<class T> static inline void uploadConstant(const char* name, const T& value)
{
cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) );
}
template<class T> static inline void uploadConstant(const char* name, const T& value, cudaStream_t stream)
{
cudaSafeCall( cudaMemcpyToSymbolAsync(name, &value, sizeof(T), 0, cudaMemcpyHostToDevice, stream) );
} */
//template<class T> static inline void bindTexture(const char* name, const DevMem2D_<T>& img)
//{
// //!!!! const_cast is disabled!
// //!!!! Please use constructor of 'class texture' instead.
//
// //textureReference* tex;
// //cudaSafeCall( cudaGetTextureReference((const textureReference**)&tex, name) );
// //tex->normalized = normalized;
// //tex->filterMode = filterMode;
// //tex->addressMode[0] = addrMode;
// //tex->addressMode[1] = addrMode;
//
// const textureReference* tex;
// cudaSafeCall( cudaGetTextureReference(&tex, name) );
//
// cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
// cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
//}
//static inline void unbindTexture(const char *name)
//{
// const textureReference* tex;
// cudaSafeCall( cudaGetTextureReference(&tex, name) );
// cudaSafeCall( cudaUnbindTexture(tex) );
//}
//class TextureBinder
//{
//public:
// TextureBinder() : tex_(0) {}
// template <typename T> TextureBinder(const textureReference* tex, const DevMem2D_<T>& img) : tex_(0)
// {
// bind(tex, img);
// }
// template <typename T> TextureBinder(const char* tex_name, const DevMem2D_<T>& img) : tex_(0)
// {
// bind(tex_name, img);
// }
// ~TextureBinder() { unbind(); }
//
// template <typename T> void bind(const textureReference* tex, const DevMem2D_<T>& img)
// {
// unbind();
//
// cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
// cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
//
// tex_ = tex;
// }
// template <typename T> void bind(const char* tex_name, const DevMem2D_<T>& img)
// {
// const textureReference* tex;
// cudaSafeCall( cudaGetTextureReference(&tex, tex_name) );
// bind(tex, img);
// }
//
// void unbind()
// {
// if (tex_)
// {
// cudaUnbindTexture(tex_);
// tex_ = 0;
// }
// }
//
//private:
// const textureReference* tex_;
//};
class NppStreamHandler
{
public:

File diff suppressed because it is too large Load Diff

View File

@@ -42,174 +42,172 @@
#include "internal_shared.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace mathfunc {
//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar
struct Nothing
namespace cv { namespace gpu { namespace device
{
static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
namespace mathfunc
{
}
};
struct Magnitude
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
}
};
struct MagnitudeSqr
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
}
};
struct Atan2
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
{
float angle = ::atan2f(y_data, x_data);
angle += (angle < 0) * 2.0 * CV_PI;
dst[y * dst_step + x] = scale * angle;
}
};
template <typename Mag, typename Angle>
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar
if (x < width && y < height)
{
float x_data = xptr[y * x_step + x];
float y_data = yptr[y * y_step + x];
Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
}
}
struct NonEmptyMag
{
static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
{
return mag[y * mag_step + x];
}
};
struct EmptyMag
{
static __device__ __forceinline__ float get(const float*, size_t, int, int)
{
return 1.0f;
}
};
template <typename Mag>
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float mag_data = Mag::get(mag, mag_step, x, y);
float angle_data = angle[y * angle_step + x];
float sin_a, cos_a;
::sincosf(scale * angle_data, &sin_a, &cos_a);
xptr[y * x_step + x] = mag_data * cos_a;
yptr[y * y_step + x] = mag_data * sin_a;
}
}
template <typename Mag, typename Angle>
void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(x.cols, threads.x);
grid.y = divUp(x.rows, threads.y);
const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2][2][2] =
{
struct Nothing
{
static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
{
cartToPolar_caller<Magnitude, Atan2>,
cartToPolar_caller<Magnitude, Nothing>
},
{
cartToPolar_caller<MagnitudeSqr, Atan2>,
cartToPolar_caller<MagnitudeSqr, Nothing>,
}
},
};
struct Magnitude
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>
},
dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
}
};
struct MagnitudeSqr
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>,
dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
}
};
struct Atan2
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
{
float angle = ::atan2f(y_data, x_data);
angle += (angle < 0) * 2.0 * CV_PI;
dst[y * dst_step + x] = scale * angle;
}
};
template <typename Mag, typename Angle>
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float x_data = xptr[y * x_step + x];
float y_data = yptr[y * y_step + x];
Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
}
}
};
callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
}
struct NonEmptyMag
{
static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
{
return mag[y * mag_step + x];
}
};
struct EmptyMag
{
static __device__ __forceinline__ float get(const float*, size_t, int, int)
{
return 1.0f;
}
};
template <typename Mag>
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
template <typename Mag>
void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
if (x < width && y < height)
{
float mag_data = Mag::get(mag, mag_step, x, y);
float angle_data = angle[y * angle_step + x];
float sin_a, cos_a;
grid.x = divUp(mag.cols, threads.x);
grid.y = divUp(mag.rows, threads.y);
const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
::sincosf(scale * angle_data, &sin_a, &cos_a);
polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
cudaSafeCall( cudaGetLastError() );
xptr[y * x_step + x] = mag_data * cos_a;
yptr[y * y_step + x] = mag_data * sin_a;
}
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename Mag, typename Angle>
void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2] =
{
polarToCart_caller<NonEmptyMag>,
polarToCart_caller<EmptyMag>
};
grid.x = divUp(x.cols, threads.x);
grid.y = divUp(x.rows, threads.y);
const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
}
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
cudaSafeCall( cudaGetLastError() );
} // namespace mathfunc
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
END_OPENCV_DEVICE_NAMESPACE
void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2][2][2] =
{
{
{
cartToPolar_caller<Magnitude, Atan2>,
cartToPolar_caller<Magnitude, Nothing>
},
{
cartToPolar_caller<MagnitudeSqr, Atan2>,
cartToPolar_caller<MagnitudeSqr, Nothing>,
}
},
{
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>
},
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>,
}
}
};
callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
}
template <typename Mag>
void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(mag.cols, threads.x);
grid.y = divUp(mag.rows, threads.y);
const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2] =
{
polarToCart_caller<NonEmptyMag>,
polarToCart_caller<EmptyMag>
};
callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
}
} // namespace mathfunc
}}} // namespace cv { namespace gpu { namespace device

View File

@@ -45,304 +45,303 @@
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
template <typename T> struct shift_and_sizeof;
template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// CopyTo /////////////////////////////////
///////////////////////////////////////////////////////////////////////////
template<typename T>
__global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
namespace cv { namespace gpu { namespace device
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
template <typename T> struct shift_and_sizeof;
template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };
if ((x < cols * channels ) && (y < rows))
if (mask[y * step_mask + x / channels] != 0)
{
size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;
mat_dst[idx] = mat_src[idx];
}
}
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// CopyTo /////////////////////////////////
///////////////////////////////////////////////////////////////////////////
template<typename T>
void copy_to_with_mask_run(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
{
dim3 threadsPerBlock(16,16, 1);
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall ( cudaDeviceSynchronize() );
}
void copy_to_with_mask(const DevMem2Db& mat_src, DevMem2Db mat_dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
{
typedef void (*CopyToFunc)(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream);
static CopyToFunc tab[8] =
template<typename T>
__global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
{
copy_to_with_mask_run<unsigned char>,
copy_to_with_mask_run<signed char>,
copy_to_with_mask_run<unsigned short>,
copy_to_with_mask_run<short>,
copy_to_with_mask_run<int>,
copy_to_with_mask_run<float>,
copy_to_with_mask_run<double>,
0
};
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
CopyToFunc func = tab[depth];
if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);
func(mat_src, mat_dst, mask, channels, stream);
}
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// SetTo //////////////////////////////////
///////////////////////////////////////////////////////////////////////////
__constant__ uchar scalar_8u[4];
__constant__ schar scalar_8s[4];
__constant__ ushort scalar_16u[4];
__constant__ short scalar_16s[4];
__constant__ int scalar_32s[4];
__constant__ float scalar_32f[4];
__constant__ double scalar_64f[4];
template <typename T> __device__ __forceinline__ T readScalar(int i);
template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}
template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}
template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}
template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}
template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}
void writeScalar(const uchar* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );
}
void writeScalar(const schar* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );
}
void writeScalar(const ushort* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );
}
void writeScalar(const short* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );
}
void writeScalar(const int* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );
}
void writeScalar(const float* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );
}
void writeScalar(const double* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );
}
template<typename T>
__global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < cols * channels ) && (y < rows))
{
size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
mat[idx] = readScalar<T>(x % channels);
if ((x < cols * channels ) && (y < rows))
if (mask[y * step_mask + x / channels] != 0)
{
size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;
mat_dst[idx] = mat_src[idx];
}
}
}
template<typename T>
__global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
template<typename T>
void copy_to_with_mask_run(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
{
dim3 threadsPerBlock(16,16, 1);
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
if ((x < cols * channels ) && (y < rows))
if (mask[y * step_mask + x / channels] != 0)
copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall ( cudaDeviceSynchronize() );
}
void copy_to_with_mask(const DevMem2Db& mat_src, DevMem2Db mat_dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
{
typedef void (*CopyToFunc)(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream);
static CopyToFunc tab[8] =
{
copy_to_with_mask_run<unsigned char>,
copy_to_with_mask_run<signed char>,
copy_to_with_mask_run<unsigned short>,
copy_to_with_mask_run<short>,
copy_to_with_mask_run<int>,
copy_to_with_mask_run<float>,
copy_to_with_mask_run<double>,
0
};
CopyToFunc func = tab[depth];
if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);
func(mat_src, mat_dst, mask, channels, stream);
}
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// SetTo //////////////////////////////////
///////////////////////////////////////////////////////////////////////////
__constant__ uchar scalar_8u[4];
__constant__ schar scalar_8s[4];
__constant__ ushort scalar_16u[4];
__constant__ short scalar_16s[4];
__constant__ int scalar_32s[4];
__constant__ float scalar_32f[4];
__constant__ double scalar_64f[4];
template <typename T> __device__ __forceinline__ T readScalar(int i);
template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}
template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}
template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}
template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}
template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}
void writeScalar(const uchar* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );
}
void writeScalar(const schar* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );
}
void writeScalar(const ushort* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );
}
void writeScalar(const short* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );
}
void writeScalar(const int* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );
}
void writeScalar(const float* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );
}
void writeScalar(const double* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );
}
template<typename T>
__global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < cols * channels ) && (y < rows))
{
size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
mat[idx] = readScalar<T>(x % channels);
}
}
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream)
{
writeScalar(scalar);
dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall ( cudaDeviceSynchronize() );
}
template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream)
{
writeScalar(scalar);
dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall ( cudaDeviceSynchronize() );
}
template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, int channels, cudaStream_t stream);
///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////
template <typename T, typename D> struct Convertor : unary_function<T, D>
{
Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
__device__ __forceinline__ D operator()(const T& src) const
{
return saturate_cast<D>(alpha * src + beta);
}
const double alpha, beta;
};
template<typename T>
__global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
namespace detail
{
template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
if ((x < cols * channels ) && (y < rows))
if (mask[y * step_mask + x / channels] != 0)
{
size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
mat[idx] = readScalar<T>(x % channels);
}
}
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream)
{
};
template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
writeScalar(scalar);
dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall ( cudaDeviceSynchronize() );
}
template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream)
{
enum { smart_shift = 8 };
};
template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
writeScalar(scalar);
dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall ( cudaDeviceSynchronize() );
}
template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, int channels, cudaStream_t stream);
///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////
template <typename T, typename D> struct Convertor : unary_function<T, D>
{
enum { smart_shift = 4 };
};
template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
__device__ __forceinline__ D operator()(const T& src) const
{
return saturate_cast<D>(alpha * src + beta);
}
const double alpha, beta;
};
template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
namespace detail
{
enum { smart_shift = 4 };
};
template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_shift = 2 };
};
template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
{
};
template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_shift = 8 };
};
template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_shift = 4 };
};
template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 2 };
};
template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_shift = 4 };
};
template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_shift = 2 };
};
template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 2 };
};
template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
{
};
}
template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
{
};
}
template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
{
};
template<typename T, typename D>
void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream)
{
cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
cudaSafeCall( cudaSetDoubleForDevice(&beta) );
Convertor<T, D> op(alpha, beta);
OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);
}
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta,
cudaStream_t stream = 0)
{
typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta,
cudaStream_t stream);
static const caller_t tab[8][8] =
template<typename T, typename D>
void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream)
{
{cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
cudaSafeCall( cudaSetDoubleForDevice(&beta) );
Convertor<T, D> op(alpha, beta);
::cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);
}
{cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta,
cudaStream_t stream = 0)
{
typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta,
cudaStream_t stream);
{cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
static const caller_t tab[8][8] =
{
{cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
{cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
{cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
{cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
{cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
{cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
{cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
{cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
{cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
{0,0,0,0,0,0,0,0}
};
{cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
caller_t func = tab[sdepth][ddepth];
if (!func)
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
{cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
func(src, dst, alpha, beta, stream);
}
{0,0,0,0,0,0,0,0}
};
END_OPENCV_DEVICE_NAMESPACE
caller_t func = tab[sdepth][ddepth];
if (!func)
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
func(src, dst, alpha, beta, stream);
}
}}} // namespace cv { namespace gpu { namespace device

File diff suppressed because it is too large Load Diff

View File

@@ -46,142 +46,140 @@
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc {
template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)
namespace cv { namespace gpu { namespace device
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y;
__shared__ value_type smem[256 + 4];
value_type sum;
const int src_y = 2*y;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);
smem[2 + threadIdx.x] = sum;
if (threadIdx.x < 2)
namespace imgproc
{
const int left_x = x - 2 + threadIdx.x;
template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y;
smem[threadIdx.x] = sum;
}
__shared__ value_type smem[256 + 4];
if (threadIdx.x > 253)
{
const int right_x = x + threadIdx.x + 2;
value_type sum;
const int src_y = 2*y;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);
smem[4 + threadIdx.x] = sum;
}
smem[2 + threadIdx.x] = sum;
__syncthreads();
if (threadIdx.x < 2)
{
const int left_x = x - 2 + threadIdx.x;
if (threadIdx.x < 128)
{
const int tid2 = threadIdx.x * 2;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);
sum = VecTraits<value_type>::all(0);
smem[threadIdx.x] = sum;
}
sum = sum + 0.0625f * smem[2 + tid2 - 2];
sum = sum + 0.25f * smem[2 + tid2 - 1];
sum = sum + 0.375f * smem[2 + tid2 ];
sum = sum + 0.25f * smem[2 + tid2 + 1];
sum = sum + 0.0625f * smem[2 + tid2 + 2];
if (threadIdx.x > 253)
{
const int right_x = x + threadIdx.x + 2;
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);
if (dst_x < dst_cols)
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
}
}
smem[4 + threadIdx.x] = sum;
}
template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows);
__syncthreads();
B<T> b(src.rows, src.cols);
if (threadIdx.x < 128)
{
const int tid2 = threadIdx.x * 2;
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
cudaSafeCall( cudaGetLastError() );
sum = VecTraits<value_type>::all(0);
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
sum = sum + 0.0625f * smem[2 + tid2 - 2];
sum = sum + 0.25f * smem[2 + tid2 - 1];
sum = sum + 0.375f * smem[2 + tid2 ];
sum = sum + 0.25f * smem[2 + tid2 + 1];
sum = sum + 0.0625f * smem[2 + tid2 + 2];
template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type type;
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
if (dst_x < dst_cols)
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
}
}
static const caller_t callers[] =
{
pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
};
template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows);
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
}
B<T> b(src.rows, src.cols);
template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
cudaSafeCall( cudaGetLastError() );
template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type type;
template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
static const caller_t callers[] =
{
pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
};
template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
}
} // namespace imgproc
template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
END_OPENCV_DEVICE_NAMESPACE
template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device

View File

@@ -46,137 +46,135 @@
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc {
template <typename T, typename B> __global__ void pyrUp(const PtrStep<T> src, DevMem2D_<T> dst, const B b)
namespace cv { namespace gpu { namespace device
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
__shared__ T smem1[10][10];
__shared__ value_type smem2[20][16];
value_type sum;
if (threadIdx.x < 10 && threadIdx.y < 10)
smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);
__syncthreads();
const int tidx = threadIdx.x;
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[1 + threadIdx.y / 2][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];
smem2[2 + threadIdx.y][tidx] = sum;
if (threadIdx.y < 2)
namespace imgproc
{
sum = VecTraits<value_type>::all(0);
template <typename T, typename B> __global__ void pyrUp(const PtrStep<T> src, DevMem2D_<T> dst, const B b)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[0][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
smem2[threadIdx.y][tidx] = sum;
}
__shared__ T smem1[10][10];
__shared__ value_type smem2[20][16];
if (threadIdx.y > 13)
{
sum = VecTraits<value_type>::all(0);
value_type sum;
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[9][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];
if (threadIdx.x < 10 && threadIdx.y < 10)
smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);
smem2[4 + threadIdx.y][tidx] = sum;
}
__syncthreads();
__syncthreads();
const int tidx = threadIdx.x;
sum = VecTraits<value_type>::all(0);
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y - 1][tidx];
sum = sum + (tidx % 2 == 0) * 0.375f * smem2[2 + threadIdx.y ][tidx];
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y + 1][tidx];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[1 + threadIdx.y / 2][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);
}
smem2[2 + threadIdx.y][tidx] = sum;
template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (threadIdx.y < 2)
{
sum = VecTraits<value_type>::all(0);
B<T> b(src.rows, src.cols);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[0][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];
pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
cudaSafeCall( cudaGetLastError() );
smem2[threadIdx.y][tidx] = sum;
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
if (threadIdx.y > 13)
{
sum = VecTraits<value_type>::all(0);
template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type type;
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[9][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
smem2[4 + threadIdx.y][tidx] = sum;
}
static const caller_t callers[] =
{
pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
};
__syncthreads();
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
}
sum = VecTraits<value_type>::all(0);
template void pyrUp_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y - 1][tidx];
sum = sum + (tidx % 2 == 0) * 0.375f * smem2[2 + threadIdx.y ][tidx];
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y + 1][tidx];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];
template void pyrUp_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);
}
template void pyrUp_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
template void pyrUp_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
B<T> b(src.rows, src.cols);
template void pyrUp_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
cudaSafeCall( cudaGetLastError() );
template void pyrUp_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
} // namespace imgproc
template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type type;
END_OPENCV_DEVICE_NAMESPACE
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
static const caller_t callers[] =
{
pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
};
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
}
template void pyrUp_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device

View File

@@ -47,208 +47,206 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc {
template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
namespace cv { namespace gpu { namespace device
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = mapx.ptr(y)[x];
const float ycoo = mapy.ptr(y)[x];
if (x < dst.cols && y < dst.rows)
{
const float xcoo = mapx.ptr(y)[x];
const float ycoo = mapy.ptr(y)[x];
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
{
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst,
const float* borderValue, cudaStream_t stream, int)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
{
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, int)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_remap_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_remap_ ## type , x, y); \
} \
}; \
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
{ \
static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float* borderValue, int cc) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc >= 20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , src); \
tex_remap_ ## type ##_reader texSrc; \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float*, int) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , src); \
tex_remap_ ## type ##_reader texSrc; \
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst,
const float* borderValue, cudaStream_t stream, int cc)
{
if (stream == 0)
RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, cc);
else
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);
}
};
template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst,
const float* borderValue, cudaStream_t stream, int cc);
static const caller_t callers[3][5] =
{
{
RemapDispatcher<PointFilter, BrdReflect101, T>::call,
RemapDispatcher<PointFilter, BrdReplicate, T>::call,
RemapDispatcher<PointFilter, BrdConstant, T>::call,
RemapDispatcher<PointFilter, BrdReflect, T>::call,
RemapDispatcher<PointFilter, BrdWrap, T>::call
},
{
RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
RemapDispatcher<LinearFilter, BrdConstant, T>::call,
RemapDispatcher<LinearFilter, BrdReflect, T>::call,
RemapDispatcher<LinearFilter, BrdWrap, T>::call
},
{
RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
RemapDispatcher<CubicFilter, BrdConstant, T>::call,
RemapDispatcher<CubicFilter, BrdReflect, T>::call,
RemapDispatcher<CubicFilter, BrdWrap, T>::call
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
}
}
};
callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);
}
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
{
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst,
const float* borderValue, cudaStream_t stream, int)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
template void remap_gpu<uchar >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<uchar2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
//template void remap_gpu<schar>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template void remap_gpu<ushort >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<ushort2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
{
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, int)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
template void remap_gpu<short >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<short2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
//template void remap_gpu<int >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
template void remap_gpu<float >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<float2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
cudaSafeCall( cudaDeviceSynchronize() );
}
};
} // namespace imgproc
#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_remap_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_remap_ ## type , x, y); \
} \
}; \
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
{ \
static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float* borderValue, int cc) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc >= 20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , src); \
tex_remap_ ## type ##_reader texSrc; \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float*, int) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , src); \
tex_remap_ ## type ##_reader texSrc; \
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
END_OPENCV_DEVICE_NAMESPACE
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst,
const float* borderValue, cudaStream_t stream, int cc)
{
if (stream == 0)
RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, cc);
else
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);
}
};
template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst,
const float* borderValue, cudaStream_t stream, int cc);
static const caller_t callers[3][5] =
{
{
RemapDispatcher<PointFilter, BrdReflect101, T>::call,
RemapDispatcher<PointFilter, BrdReplicate, T>::call,
RemapDispatcher<PointFilter, BrdConstant, T>::call,
RemapDispatcher<PointFilter, BrdReflect, T>::call,
RemapDispatcher<PointFilter, BrdWrap, T>::call
},
{
RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
RemapDispatcher<LinearFilter, BrdConstant, T>::call,
RemapDispatcher<LinearFilter, BrdReflect, T>::call,
RemapDispatcher<LinearFilter, BrdWrap, T>::call
},
{
RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
RemapDispatcher<CubicFilter, BrdConstant, T>::call,
RemapDispatcher<CubicFilter, BrdReflect, T>::call,
RemapDispatcher<CubicFilter, BrdWrap, T>::call
}
};
callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);
}
template void remap_gpu<uchar >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<uchar2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<schar>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<ushort2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<short2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<float2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device

View File

@@ -47,219 +47,217 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc {
template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
namespace cv { namespace gpu { namespace device
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = x / fx;
const float ycoo = y / fy;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = x / fx;
const float ycoo = y / fy;
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = x / fx;
const float ycoo = y / fy;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = x / fx;
const float ycoo = y / fy;
dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));
}
}
dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));
}
}
template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);
resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <typename T> struct ResizeDispatcherStream<PointFilter, T>
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <typename T> struct ResizeDispatcherStream<PointFilter, T>
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);
resize<<<grid, block>>>(filter_src, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
resize<<<grid, block>>>(filter_src, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_resize_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_resize_ ## type , x, y); \
} \
}; \
template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type> \
{ \
static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_resize_ ## type , src); \
tex_resize_ ## type ##_reader texSrc; \
Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \
resize<<<grid, block>>>(filter_src, fx, fy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <> struct ResizeDispatcherNonStream<PointFilter, type> \
{ \
static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_resize_ ## type , src); \
tex_resize_ ## type ##_reader texSrc; \
resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_resize_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_resize_ ## type , x, y); \
} \
}; \
template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type> \
{ \
static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_resize_ ## type , src); \
tex_resize_ ## type ##_reader texSrc; \
Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \
resize<<<grid, block>>>(filter_src, fx, fy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <> struct ResizeDispatcherNonStream<PointFilter, type> \
{ \
static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_resize_ ## type , src); \
tex_resize_ ## type ##_reader texSrc; \
resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
#undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
template <template <typename> class Filter, typename T> struct ResizeDispatcher
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
{
if (stream == 0)
ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);
else
ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
}
};
template <template <typename> class Filter, typename T> struct ResizeDispatcher
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
{
if (stream == 0)
ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);
else
ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
}
};
template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);
static const caller_t callers[3] =
{
ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call
};
static const caller_t callers[3] =
{
ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call
};
callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);
}
callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);
}
template void resize_gpu<uchar >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<uchar2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<uchar2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<schar>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<schar>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort >(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<ushort2>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort3>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort4>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort >(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<ushort2>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort3>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort4>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<short2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<short2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<float2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
} // namespace imgproc
END_OPENCV_DEVICE_NAMESPACE
template void resize_gpu<float >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<float2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device

View File

@@ -47,226 +47,224 @@
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 4
#define RESULT_STEPS 8
#define HALO_STEPS 1
namespace row_filter {
__constant__ float c_kernel[MAX_KERNEL_SIZE];
void loadKernel(const float kernel[], int ksize)
namespace cv { namespace gpu { namespace device
{
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
}
#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 4
#define RESULT_STEPS 8
#define HALO_STEPS 1
namespace detail
{
template <typename T, size_t size> struct SmemType
namespace row_filter
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
};
__constant__ float c_kernel[MAX_KERNEL_SIZE];
template <typename T> struct SmemType<T, 4>
{
typedef T smem_t;
};
}
template <typename T> struct SmemType
{
typedef typename detail::SmemType<T, sizeof(T)>::smem_t smem_t;
};
template <int KERNEL_SIZE, typename T, typename D, typename B>
__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
{
typedef typename SmemType<T>::smem_t smem_t;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
__shared__ smem_t smem[BLOCK_DIM_Y][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X];
//Offset to the left halo edge
const int x = (blockIdx.x * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_X + threadIdx.x;
const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
if (y < src.rows)
{
const T* src_row = src.ptr(y);
//Load main data
#pragma unroll
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
//Load left halo
#pragma unroll
for(int i = 0; i < HALO_STEPS; ++i)
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_low(i * BLOCK_DIM_X + x, src_row);
//Load right halo
#pragma unroll
for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
__syncthreads();
D* dst_row = dst.ptr(y);
#pragma unroll
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
void loadKernel(const float kernel[], int ksize)
{
sum_t sum = VecTraits<sum_t>::all(0);
#pragma unroll
for (int j = 0; j < KERNEL_SIZE; ++j)
sum = sum + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X + j - anchor] * c_kernel[j];
int dstX = x + i * BLOCK_DIM_X;
if (dstX < src.cols)
dst_row[dstX] = saturate_cast<D>(sum);
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
}
}
}
template <int ksize, typename T, typename D, template<typename> class B>
void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{
typedef typename SmemType<T>::smem_t smem_t;
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
B<smem_t> b(src.cols);
linearRowFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename D>
void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
static const caller_t callers[5][17] =
{
namespace detail
{
0,
linearRowFilter_caller<1 , T, D, BrdRowReflect101>,
linearRowFilter_caller<2 , T, D, BrdRowReflect101>,
linearRowFilter_caller<3 , T, D, BrdRowReflect101>,
linearRowFilter_caller<4 , T, D, BrdRowReflect101>,
linearRowFilter_caller<5 , T, D, BrdRowReflect101>,
linearRowFilter_caller<6 , T, D, BrdRowReflect101>,
linearRowFilter_caller<7 , T, D, BrdRowReflect101>,
linearRowFilter_caller<8 , T, D, BrdRowReflect101>,
linearRowFilter_caller<9 , T, D, BrdRowReflect101>,
linearRowFilter_caller<10, T, D, BrdRowReflect101>,
linearRowFilter_caller<11, T, D, BrdRowReflect101>,
linearRowFilter_caller<12, T, D, BrdRowReflect101>,
linearRowFilter_caller<13, T, D, BrdRowReflect101>,
linearRowFilter_caller<14, T, D, BrdRowReflect101>,
linearRowFilter_caller<15, T, D, BrdRowReflect101>,
linearRowFilter_caller<16, T, D, BrdRowReflect101>
},
{
0,
linearRowFilter_caller<1 , T, D, BrdRowReplicate>,
linearRowFilter_caller<2 , T, D, BrdRowReplicate>,
linearRowFilter_caller<3 , T, D, BrdRowReplicate>,
linearRowFilter_caller<4 , T, D, BrdRowReplicate>,
linearRowFilter_caller<5 , T, D, BrdRowReplicate>,
linearRowFilter_caller<6 , T, D, BrdRowReplicate>,
linearRowFilter_caller<7 , T, D, BrdRowReplicate>,
linearRowFilter_caller<8 , T, D, BrdRowReplicate>,
linearRowFilter_caller<9 , T, D, BrdRowReplicate>,
linearRowFilter_caller<10, T, D, BrdRowReplicate>,
linearRowFilter_caller<11, T, D, BrdRowReplicate>,
linearRowFilter_caller<12, T, D, BrdRowReplicate>,
linearRowFilter_caller<13, T, D, BrdRowReplicate>,
linearRowFilter_caller<14, T, D, BrdRowReplicate>,
linearRowFilter_caller<15, T, D, BrdRowReplicate>,
linearRowFilter_caller<16, T, D, BrdRowReplicate>
},
{
0,
linearRowFilter_caller<1 , T, D, BrdRowConstant>,
linearRowFilter_caller<2 , T, D, BrdRowConstant>,
linearRowFilter_caller<3 , T, D, BrdRowConstant>,
linearRowFilter_caller<4 , T, D, BrdRowConstant>,
linearRowFilter_caller<5 , T, D, BrdRowConstant>,
linearRowFilter_caller<6 , T, D, BrdRowConstant>,
linearRowFilter_caller<7 , T, D, BrdRowConstant>,
linearRowFilter_caller<8 , T, D, BrdRowConstant>,
linearRowFilter_caller<9 , T, D, BrdRowConstant>,
linearRowFilter_caller<10, T, D, BrdRowConstant>,
linearRowFilter_caller<11, T, D, BrdRowConstant>,
linearRowFilter_caller<12, T, D, BrdRowConstant>,
linearRowFilter_caller<13, T, D, BrdRowConstant>,
linearRowFilter_caller<14, T, D, BrdRowConstant>,
linearRowFilter_caller<15, T, D, BrdRowConstant>,
linearRowFilter_caller<16, T, D, BrdRowConstant>
},
{
0,
linearRowFilter_caller<1 , T, D, BrdRowReflect>,
linearRowFilter_caller<2 , T, D, BrdRowReflect>,
linearRowFilter_caller<3 , T, D, BrdRowReflect>,
linearRowFilter_caller<4 , T, D, BrdRowReflect>,
linearRowFilter_caller<5 , T, D, BrdRowReflect>,
linearRowFilter_caller<6 , T, D, BrdRowReflect>,
linearRowFilter_caller<7 , T, D, BrdRowReflect>,
linearRowFilter_caller<8 , T, D, BrdRowReflect>,
linearRowFilter_caller<9 , T, D, BrdRowReflect>,
linearRowFilter_caller<10, T, D, BrdRowReflect>,
linearRowFilter_caller<11, T, D, BrdRowReflect>,
linearRowFilter_caller<12, T, D, BrdRowReflect>,
linearRowFilter_caller<13, T, D, BrdRowReflect>,
linearRowFilter_caller<14, T, D, BrdRowReflect>,
linearRowFilter_caller<15, T, D, BrdRowReflect>,
linearRowFilter_caller<16, T, D, BrdRowReflect>
},
{
0,
linearRowFilter_caller<1 , T, D, BrdRowWrap>,
linearRowFilter_caller<2 , T, D, BrdRowWrap>,
linearRowFilter_caller<3 , T, D, BrdRowWrap>,
linearRowFilter_caller<4 , T, D, BrdRowWrap>,
linearRowFilter_caller<5 , T, D, BrdRowWrap>,
linearRowFilter_caller<6 , T, D, BrdRowWrap>,
linearRowFilter_caller<7 , T, D, BrdRowWrap>,
linearRowFilter_caller<8 , T, D, BrdRowWrap>,
linearRowFilter_caller<9 , T, D, BrdRowWrap>,
linearRowFilter_caller<10, T, D, BrdRowWrap>,
linearRowFilter_caller<11, T, D, BrdRowWrap>,
linearRowFilter_caller<12, T, D, BrdRowWrap>,
linearRowFilter_caller<13, T, D, BrdRowWrap>,
linearRowFilter_caller<14, T, D, BrdRowWrap>,
linearRowFilter_caller<15, T, D, BrdRowWrap>,
linearRowFilter_caller<16, T, D, BrdRowWrap>
template <typename T, size_t size> struct SmemType
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
};
template <typename T> struct SmemType<T, 4>
{
typedef T smem_t;
};
}
};
loadKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
}
template <typename T> struct SmemType
{
typedef typename detail::SmemType<T, sizeof(T)>::smem_t smem_t;
};
template void linearRowFilter_gpu<uchar , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<uchar4, float4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearRowFilter_gpu<short , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearRowFilter_gpu<short2, float2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<short3, float3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<int , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template <int KERNEL_SIZE, typename T, typename D, typename B>
__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
{
typedef typename SmemType<T>::smem_t smem_t;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
} // namespace row_filter
__shared__ smem_t smem[BLOCK_DIM_Y][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X];
END_OPENCV_DEVICE_NAMESPACE
//Offset to the left halo edge
const int x = (blockIdx.x * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_X + threadIdx.x;
const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
if (y < src.rows)
{
const T* src_row = src.ptr(y);
//Load main data
#pragma unroll
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
//Load left halo
#pragma unroll
for(int i = 0; i < HALO_STEPS; ++i)
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_low(i * BLOCK_DIM_X + x, src_row);
//Load right halo
#pragma unroll
for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
__syncthreads();
D* dst_row = dst.ptr(y);
#pragma unroll
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
{
sum_t sum = VecTraits<sum_t>::all(0);
#pragma unroll
for (int j = 0; j < KERNEL_SIZE; ++j)
sum = sum + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X + j - anchor] * c_kernel[j];
int dstX = x + i * BLOCK_DIM_X;
if (dstX < src.cols)
dst_row[dstX] = saturate_cast<D>(sum);
}
}
}
template <int ksize, typename T, typename D, template<typename> class B>
void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{
typedef typename SmemType<T>::smem_t smem_t;
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
B<smem_t> b(src.cols);
linearRowFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename D>
void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
static const caller_t callers[5][17] =
{
{
0,
linearRowFilter_caller<1 , T, D, BrdRowReflect101>,
linearRowFilter_caller<2 , T, D, BrdRowReflect101>,
linearRowFilter_caller<3 , T, D, BrdRowReflect101>,
linearRowFilter_caller<4 , T, D, BrdRowReflect101>,
linearRowFilter_caller<5 , T, D, BrdRowReflect101>,
linearRowFilter_caller<6 , T, D, BrdRowReflect101>,
linearRowFilter_caller<7 , T, D, BrdRowReflect101>,
linearRowFilter_caller<8 , T, D, BrdRowReflect101>,
linearRowFilter_caller<9 , T, D, BrdRowReflect101>,
linearRowFilter_caller<10, T, D, BrdRowReflect101>,
linearRowFilter_caller<11, T, D, BrdRowReflect101>,
linearRowFilter_caller<12, T, D, BrdRowReflect101>,
linearRowFilter_caller<13, T, D, BrdRowReflect101>,
linearRowFilter_caller<14, T, D, BrdRowReflect101>,
linearRowFilter_caller<15, T, D, BrdRowReflect101>,
linearRowFilter_caller<16, T, D, BrdRowReflect101>
},
{
0,
linearRowFilter_caller<1 , T, D, BrdRowReplicate>,
linearRowFilter_caller<2 , T, D, BrdRowReplicate>,
linearRowFilter_caller<3 , T, D, BrdRowReplicate>,
linearRowFilter_caller<4 , T, D, BrdRowReplicate>,
linearRowFilter_caller<5 , T, D, BrdRowReplicate>,
linearRowFilter_caller<6 , T, D, BrdRowReplicate>,
linearRowFilter_caller<7 , T, D, BrdRowReplicate>,
linearRowFilter_caller<8 , T, D, BrdRowReplicate>,
linearRowFilter_caller<9 , T, D, BrdRowReplicate>,
linearRowFilter_caller<10, T, D, BrdRowReplicate>,
linearRowFilter_caller<11, T, D, BrdRowReplicate>,
linearRowFilter_caller<12, T, D, BrdRowReplicate>,
linearRowFilter_caller<13, T, D, BrdRowReplicate>,
linearRowFilter_caller<14, T, D, BrdRowReplicate>,
linearRowFilter_caller<15, T, D, BrdRowReplicate>,
linearRowFilter_caller<16, T, D, BrdRowReplicate>
},
{
0,
linearRowFilter_caller<1 , T, D, BrdRowConstant>,
linearRowFilter_caller<2 , T, D, BrdRowConstant>,
linearRowFilter_caller<3 , T, D, BrdRowConstant>,
linearRowFilter_caller<4 , T, D, BrdRowConstant>,
linearRowFilter_caller<5 , T, D, BrdRowConstant>,
linearRowFilter_caller<6 , T, D, BrdRowConstant>,
linearRowFilter_caller<7 , T, D, BrdRowConstant>,
linearRowFilter_caller<8 , T, D, BrdRowConstant>,
linearRowFilter_caller<9 , T, D, BrdRowConstant>,
linearRowFilter_caller<10, T, D, BrdRowConstant>,
linearRowFilter_caller<11, T, D, BrdRowConstant>,
linearRowFilter_caller<12, T, D, BrdRowConstant>,
linearRowFilter_caller<13, T, D, BrdRowConstant>,
linearRowFilter_caller<14, T, D, BrdRowConstant>,
linearRowFilter_caller<15, T, D, BrdRowConstant>,
linearRowFilter_caller<16, T, D, BrdRowConstant>
},
{
0,
linearRowFilter_caller<1 , T, D, BrdRowReflect>,
linearRowFilter_caller<2 , T, D, BrdRowReflect>,
linearRowFilter_caller<3 , T, D, BrdRowReflect>,
linearRowFilter_caller<4 , T, D, BrdRowReflect>,
linearRowFilter_caller<5 , T, D, BrdRowReflect>,
linearRowFilter_caller<6 , T, D, BrdRowReflect>,
linearRowFilter_caller<7 , T, D, BrdRowReflect>,
linearRowFilter_caller<8 , T, D, BrdRowReflect>,
linearRowFilter_caller<9 , T, D, BrdRowReflect>,
linearRowFilter_caller<10, T, D, BrdRowReflect>,
linearRowFilter_caller<11, T, D, BrdRowReflect>,
linearRowFilter_caller<12, T, D, BrdRowReflect>,
linearRowFilter_caller<13, T, D, BrdRowReflect>,
linearRowFilter_caller<14, T, D, BrdRowReflect>,
linearRowFilter_caller<15, T, D, BrdRowReflect>,
linearRowFilter_caller<16, T, D, BrdRowReflect>
},
{
0,
linearRowFilter_caller<1 , T, D, BrdRowWrap>,
linearRowFilter_caller<2 , T, D, BrdRowWrap>,
linearRowFilter_caller<3 , T, D, BrdRowWrap>,
linearRowFilter_caller<4 , T, D, BrdRowWrap>,
linearRowFilter_caller<5 , T, D, BrdRowWrap>,
linearRowFilter_caller<6 , T, D, BrdRowWrap>,
linearRowFilter_caller<7 , T, D, BrdRowWrap>,
linearRowFilter_caller<8 , T, D, BrdRowWrap>,
linearRowFilter_caller<9 , T, D, BrdRowWrap>,
linearRowFilter_caller<10, T, D, BrdRowWrap>,
linearRowFilter_caller<11, T, D, BrdRowWrap>,
linearRowFilter_caller<12, T, D, BrdRowWrap>,
linearRowFilter_caller<13, T, D, BrdRowWrap>,
linearRowFilter_caller<14, T, D, BrdRowWrap>,
linearRowFilter_caller<15, T, D, BrdRowWrap>,
linearRowFilter_caller<16, T, D, BrdRowWrap>
}
};
loadKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
}
template void linearRowFilter_gpu<uchar , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<uchar4, float4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearRowFilter_gpu<short , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearRowFilter_gpu<short2, float2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<short3, float3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<int , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
} // namespace row_filter
}}} // namespace cv { namespace gpu { namespace device

View File

@@ -62,44 +62,43 @@
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__)
#endif
namespace cv { namespace gpu {
void error(const char *error_string, const char *file, const int line, const char *func = "");
void nppError(int err, const char *file, const int line, const char *func = "");
void ncvError(int err, const char *file, const int line, const char *func = "");
void cufftError(int err, const char *file, const int line, const char *func = "");
void cublasError(int err, const char *file, const int line, const char *func = "");
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
namespace cv { namespace gpu
{
if (cudaSuccess != err)
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
void error(const char *error_string, const char *file, const int line, const char *func = "");
void nppError(int err, const char *file, const int line, const char *func = "");
void ncvError(int err, const char *file, const int line, const char *func = "");
void cufftError(int err, const char *file, const int line, const char *func = "");
void cublasError(int err, const char *file, const int line, const char *func = "");
static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
cv::gpu::nppError(err, file, line, func);
}
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err)
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (NCV_SUCCESS != err)
cv::gpu::ncvError(err, file, line, func);
}
static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
cv::gpu::nppError(err, file, line, func);
}
static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
{
if (CUFFT_SUCCESS != err)
cv::gpu::cufftError(err, file, line, func);
}
static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (NCV_SUCCESS != err)
cv::gpu::ncvError(err, file, line, func);
}
static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
{
if (CUBLAS_STATUS_SUCCESS != err)
cv::gpu::cublasError(err, file, line, func);
}
static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
{
if (CUFFT_SUCCESS != err)
cv::gpu::cufftError(err, file, line, func);
}
static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
{
if (CUBLAS_STATUS_SUCCESS != err)
cv::gpu::cublasError(err, file, line, func);
}
}}
#endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */

View File

@@ -42,467 +42,465 @@
#include "internal_shared.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace split_merge {
template <typename T, size_t elem_size = sizeof(T)>
struct TypeTraits
namespace cv { namespace gpu { namespace device
{
typedef T type;
typedef T type2;
typedef T type3;
typedef T type4;
};
template <typename T>
struct TypeTraits<T, 1>
{
typedef char type;
typedef char2 type2;
typedef char3 type3;
typedef char4 type4;
};
template <typename T>
struct TypeTraits<T, 2>
{
typedef short type;
typedef short2 type2;
typedef short3 type3;
typedef short4 type4;
};
template <typename T>
struct TypeTraits<T, 4>
{
typedef int type;
typedef int2 type2;
typedef int3 type3;
typedef int4 type4;
};
template <typename T>
struct TypeTraits<T, 8>
{
typedef double type;
typedef double2 type2;
//typedef double3 type3;
//typedef double4 type3;
};
typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);
typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);
//------------------------------------------------------------
// Merge
template <typename T>
__global__ void mergeC2_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type2 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_y[x] = dst_elem;
}
}
template <typename T>
__global__ void mergeC3_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type3 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
const T* src2_y = (const T*)(src2 + y * src2_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_elem.z = src2_y[x];
dst_y[x] = dst_elem;
}
}
template <>
__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src0_y = (const double*)(src0 + y * src0_step);
const double* src1_y = (const double*)(src1 + y * src1_step);
const double* src2_y = (const double*)(src2 + y * src2_step);
double* dst_y = (double*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_y[3 * x] = src0_y[x];
dst_y[3 * x + 1] = src1_y[x];
dst_y[3 * x + 2] = src2_y[x];
}
}
template <typename T>
__global__ void mergeC4_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type4 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
const T* src2_y = (const T*)(src2 + y * src2_step);
const T* src3_y = (const T*)(src3 + y * src3_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_elem.z = src2_y[x];
dst_elem.w = src3_y[x];
dst_y[x] = dst_elem;
}
}
template <>
__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src0_y = (const double*)(src0 + y * src0_step);
const double* src1_y = (const double*)(src1 + y * src1_step);
const double* src2_y = (const double*)(src2 + y * src2_step);
const double* src3_y = (const double*)(src3 + y * src3_step);
double2* dst_y = (double2*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
}
}
template <typename T>
static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
src[3].data, src[3].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void merge_caller(const DevMem2Db* src, DevMem2Db& dst,
int total_channels, size_t elem_size,
const cudaStream_t& stream)
{
static MergeFunction merge_func_tbl[] =
namespace split_merge
{
mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
};
template <typename T, size_t elem_size = sizeof(T)>
struct TypeTraits
{
typedef T type;
typedef T type2;
typedef T type3;
typedef T type4;
};
size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
MergeFunction merge_func = merge_func_tbl[merge_func_id];
template <typename T>
struct TypeTraits<T, 1>
{
typedef char type;
typedef char2 type2;
typedef char3 type3;
typedef char4 type4;
};
if (merge_func == 0)
cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
template <typename T>
struct TypeTraits<T, 2>
{
typedef short type;
typedef short2 type2;
typedef short3 type3;
typedef short4 type4;
};
merge_func(src, dst, stream);
}
template <typename T>
struct TypeTraits<T, 4>
{
typedef int type;
typedef int2 type2;
typedef int3 type3;
typedef int4 type4;
};
template <typename T>
struct TypeTraits<T, 8>
{
typedef double type;
typedef double2 type2;
//typedef double3 type3;
//typedef double4 type3;
};
typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);
typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);
//------------------------------------------------------------
// Merge
template <typename T>
__global__ void mergeC2_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type2 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_y[x] = dst_elem;
}
}
template <typename T>
__global__ void mergeC3_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type3 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
const T* src2_y = (const T*)(src2 + y * src2_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_elem.z = src2_y[x];
dst_y[x] = dst_elem;
}
}
template <>
__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src0_y = (const double*)(src0 + y * src0_step);
const double* src1_y = (const double*)(src1 + y * src1_step);
const double* src2_y = (const double*)(src2 + y * src2_step);
double* dst_y = (double*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_y[3 * x] = src0_y[x];
dst_y[3 * x + 1] = src1_y[x];
dst_y[3 * x + 2] = src2_y[x];
}
}
template <typename T>
__global__ void mergeC4_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type4 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
const T* src2_y = (const T*)(src2 + y * src2_step);
const T* src3_y = (const T*)(src3 + y * src3_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_elem.z = src2_y[x];
dst_elem.w = src3_y[x];
dst_y[x] = dst_elem;
}
}
template <>
__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src0_y = (const double*)(src0 + y * src0_step);
const double* src1_y = (const double*)(src1 + y * src1_step);
const double* src2_y = (const double*)(src2 + y * src2_step);
const double* src3_y = (const double*)(src3 + y * src3_step);
double2* dst_y = (double2*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
}
}
template <typename T>
static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
src[3].data, src[3].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void merge_caller(const DevMem2Db* src, DevMem2Db& dst,
int total_channels, size_t elem_size,
const cudaStream_t& stream)
{
static MergeFunction merge_func_tbl[] =
{
mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
};
size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
MergeFunction merge_func = merge_func_tbl[merge_func_id];
if (merge_func == 0)
cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
merge_func(src, dst, stream);
}
//------------------------------------------------------------
// Split
//------------------------------------------------------------
// Split
template <typename T>
__global__ void splitC2_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step)
{
typedef typename TypeTraits<T>::type2 src_type;
template <typename T>
__global__ void splitC2_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step)
{
typedef typename TypeTraits<T>::type2 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
}
}
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
}
}
template <typename T>
__global__ void splitC3_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step)
{
typedef typename TypeTraits<T>::type3 src_type;
template <typename T>
__global__ void splitC3_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step)
{
typedef typename TypeTraits<T>::type3 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
T* dst2_y = (T*)(dst2 + y * dst2_step);
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
T* dst2_y = (T*)(dst2 + y * dst2_step);
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
dst2_y[x] = src_elem.z;
}
}
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
dst2_y[x] = src_elem.z;
}
}
template <>
__global__ void splitC3_<double>(
const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
template <>
__global__ void splitC3_<double>(
const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src_y = (const double*)(src + y * src_step);
double* dst0_y = (double*)(dst0 + y * dst0_step);
double* dst1_y = (double*)(dst1 + y * dst1_step);
double* dst2_y = (double*)(dst2 + y * dst2_step);
const double* src_y = (const double*)(src + y * src_step);
double* dst0_y = (double*)(dst0 + y * dst0_step);
double* dst1_y = (double*)(dst1 + y * dst1_step);
double* dst2_y = (double*)(dst2 + y * dst2_step);
if (x < cols && y < rows)
{
dst0_y[x] = src_y[3 * x];
dst1_y[x] = src_y[3 * x + 1];
dst2_y[x] = src_y[3 * x + 2];
}
}
if (x < cols && y < rows)
{
dst0_y[x] = src_y[3 * x];
dst1_y[x] = src_y[3 * x + 1];
dst2_y[x] = src_y[3 * x + 2];
}
}
template <typename T>
__global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step)
{
typedef typename TypeTraits<T>::type4 src_type;
template <typename T>
__global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step)
{
typedef typename TypeTraits<T>::type4 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
T* dst2_y = (T*)(dst2 + y * dst2_step);
T* dst3_y = (T*)(dst3 + y * dst3_step);
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
T* dst2_y = (T*)(dst2 + y * dst2_step);
T* dst3_y = (T*)(dst3 + y * dst3_step);
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
dst2_y[x] = src_elem.z;
dst3_y[x] = src_elem.w;
}
}
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
dst2_y[x] = src_elem.z;
dst3_y[x] = src_elem.w;
}
}
template <>
__global__ void splitC4_<double>(
const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
template <>
__global__ void splitC4_<double>(
const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double2* src_y = (const double2*)(src + y * src_step);
double* dst0_y = (double*)(dst0 + y * dst0_step);
double* dst1_y = (double*)(dst1 + y * dst1_step);
double* dst2_y = (double*)(dst2 + y * dst2_step);
double* dst3_y = (double*)(dst3 + y * dst3_step);
const double2* src_y = (const double2*)(src + y * src_step);
double* dst0_y = (double*)(dst0 + y * dst0_step);
double* dst1_y = (double*)(dst1 + y * dst1_step);
double* dst2_y = (double*)(dst2 + y * dst2_step);
double* dst3_y = (double*)(dst3 + y * dst3_step);
if (x < cols && y < rows)
{
double2 src_elem1 = src_y[2 * x];
double2 src_elem2 = src_y[2 * x + 1];
dst0_y[x] = src_elem1.x;
dst1_y[x] = src_elem1.y;
dst2_y[x] = src_elem2.x;
dst3_y[x] = src_elem2.y;
}
}
if (x < cols && y < rows)
{
double2 src_elem1 = src_y[2 * x];
double2 src_elem2 = src_y[2 * x + 1];
dst0_y[x] = src_elem1.x;
dst1_y[x] = src_elem1.y;
dst2_y[x] = src_elem2.x;
dst3_y[x] = src_elem2.y;
}
}
template <typename T>
static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step);
cudaSafeCall( cudaGetLastError() );
template <typename T>
static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step);
cudaSafeCall( cudaGetLastError() );
template <typename T>
static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step,
dst[3].data, dst[3].step);
cudaSafeCall( cudaGetLastError() );
template <typename T>
static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{
dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step,
dst[3].data, dst[3].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
{
static SplitFunction split_func_tbl[] =
{
splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
};
void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
{
static SplitFunction split_func_tbl[] =
{
splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
};
size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
SplitFunction split_func = split_func_tbl[split_func_id];
size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
SplitFunction split_func = split_func_tbl[split_func_id];
if (split_func == 0)
cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
if (split_func == 0)
cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
split_func(src, dst, stream);
}
} // namespace split_merge
END_OPENCV_DEVICE_NAMESPACE
split_func(src, dst, stream);
}
} // namespace split_merge
}}} // namespace cv { namespace gpu { namespace device

View File

@@ -42,496 +42,494 @@
#include "internal_shared.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace stereobm {
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////// Stereo BM ////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
#define ROWSperTHREAD 21 // the number of rows a thread will process
#define BLOCK_W 128 // the thread block width (464)
#define N_DISPARITIES 8
#define STEREO_MIND 0 // The minimum d range to check
#define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing
__constant__ unsigned int* cminSSDImage;
__constant__ size_t cminSSD_step;
__constant__ int cwidth;
__constant__ int cheight;
__device__ __forceinline__ int SQ(int a)
namespace cv { namespace gpu { namespace device
{
return a * a;
}
template<int RADIUS>
__device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
{
unsigned int cache = 0;
unsigned int cache2 = 0;
for(int i = 1; i <= RADIUS; i++)
cache += col_ssd[i];
col_ssd_cache[0] = cache;
__syncthreads();
if (threadIdx.x < BLOCK_W - RADIUS)
cache2 = col_ssd_cache[RADIUS];
else
for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)
cache2 += col_ssd[i];
return col_ssd[0] + cache + cache2;
}
template<int RADIUS>
__device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
{
unsigned int ssd[N_DISPARITIES];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));
int bestIdx = 0;
for (int i = 0; i < N_DISPARITIES; i++)
namespace stereobm
{
if (mssd == ssd[i])
bestIdx = i;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////// Stereo BM ////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
return make_uint2(mssd, bestIdx);
}
#define ROWSperTHREAD 21 // the number of rows a thread will process
template<int RADIUS>
__device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
{
unsigned char leftPixel1;
unsigned char leftPixel2;
unsigned char rightPixel1[8];
unsigned char rightPixel2[8];
unsigned int diff1, diff2;
#define BLOCK_W 128 // the thread block width (464)
#define N_DISPARITIES 8
leftPixel1 = imageL[idx1];
leftPixel2 = imageL[idx2];
#define STEREO_MIND 0 // The minimum d range to check
#define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing
idx1 = idx1 - d;
idx2 = idx2 - d;
__constant__ unsigned int* cminSSDImage;
__constant__ size_t cminSSD_step;
__constant__ int cwidth;
__constant__ int cheight;
rightPixel1[7] = imageR[idx1 - 7];
rightPixel1[0] = imageR[idx1 - 0];
rightPixel1[1] = imageR[idx1 - 1];
rightPixel1[2] = imageR[idx1 - 2];
rightPixel1[3] = imageR[idx1 - 3];
rightPixel1[4] = imageR[idx1 - 4];
rightPixel1[5] = imageR[idx1 - 5];
rightPixel1[6] = imageR[idx1 - 6];
rightPixel2[7] = imageR[idx2 - 7];
rightPixel2[0] = imageR[idx2 - 0];
rightPixel2[1] = imageR[idx2 - 1];
rightPixel2[2] = imageR[idx2 - 2];
rightPixel2[3] = imageR[idx2 - 3];
rightPixel2[4] = imageR[idx2 - 4];
rightPixel2[5] = imageR[idx2 - 5];
rightPixel2[6] = imageR[idx2 - 6];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
diff1 = leftPixel1 - rightPixel1[0];
diff2 = leftPixel2 - rightPixel2[0];
col_ssd[0 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[1];
diff2 = leftPixel2 - rightPixel2[1];
col_ssd[1 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[2];
diff2 = leftPixel2 - rightPixel2[2];
col_ssd[2 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[3];
diff2 = leftPixel2 - rightPixel2[3];
col_ssd[3 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[4];
diff2 = leftPixel2 - rightPixel2[4];
col_ssd[4 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[5];
diff2 = leftPixel2 - rightPixel2[5];
col_ssd[5 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[6];
diff2 = leftPixel2 - rightPixel2[6];
col_ssd[6 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[7];
diff2 = leftPixel2 - rightPixel2[7];
col_ssd[7 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
}
template<int RADIUS>
__device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
{
unsigned char leftPixel1;
int idx;
unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};
for(int i = 0; i < (2 * RADIUS + 1); i++)
{
idx = y_tex * im_pitch + x_tex;
leftPixel1 = imageL[idx];
idx = idx - d;
diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);
diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);
diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);
diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);
diffa[4] += SQ(leftPixel1 - imageR[idx - 4]);
diffa[5] += SQ(leftPixel1 - imageR[idx - 5]);
diffa[6] += SQ(leftPixel1 - imageR[idx - 6]);
diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
y_tex += 1;
}
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
col_ssd[0 * (BLOCK_W + 2 * RADIUS)] = diffa[0];
col_ssd[1 * (BLOCK_W + 2 * RADIUS)] = diffa[1];
col_ssd[2 * (BLOCK_W + 2 * RADIUS)] = diffa[2];
col_ssd[3 * (BLOCK_W + 2 * RADIUS)] = diffa[3];
col_ssd[4 * (BLOCK_W + 2 * RADIUS)] = diffa[4];
col_ssd[5 * (BLOCK_W + 2 * RADIUS)] = diffa[5];
col_ssd[6 * (BLOCK_W + 2 * RADIUS)] = diffa[6];
col_ssd[7 * (BLOCK_W + 2 * RADIUS)] = diffa[7];
}
template<int RADIUS>
__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)
{
extern __shared__ unsigned int col_ssd_cache[];
volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS)
//#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
//#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)
#define Y (blockIdx.y * ROWSperTHREAD + RADIUS)
//int Y = blockIdx.y * ROWSperTHREAD + RADIUS;
unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
unsigned char* disparImage = disp.data + X + Y * disp.step;
/* if (X < cwidth)
{
unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
*ptr = 0xFFFFFFFF;
}*/
int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
int y_tex;
int x_tex = X - RADIUS;
if (x_tex >= cwidth)
return;
for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
{
y_tex = Y - RADIUS;
InitColSSD<RADIUS>(x_tex, y_tex, img_step, left, right, d, col_ssd);
if (col_ssd_extra > 0)
if (x_tex + BLOCK_W < cwidth)
InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
__syncthreads(); //before MinSSD function
if (X < cwidth - RADIUS && Y < cheight - RADIUS)
__device__ __forceinline__ int SQ(int a)
{
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
if (minSSD.x < minSSDImage[0])
{
disparImage[0] = (unsigned char)(d + minSSD.y);
minSSDImage[0] = minSSD.x;
}
return a * a;
}
for(int row = 1; row < end_row; row++)
{
int idx1 = y_tex * img_step + x_tex;
int idx2 = (y_tex + (2 * RADIUS + 1)) * img_step + x_tex;
template<int RADIUS>
__device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
{
unsigned int cache = 0;
unsigned int cache2 = 0;
for(int i = 1; i <= RADIUS; i++)
cache += col_ssd[i];
col_ssd_cache[0] = cache;
__syncthreads();
StepDown<RADIUS>(idx1, idx2, left, right, d, col_ssd);
if (threadIdx.x < BLOCK_W - RADIUS)
cache2 = col_ssd_cache[RADIUS];
else
for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)
cache2 += col_ssd[i];
if (col_ssd_extra)
if (x_tex + BLOCK_W < cwidth)
StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
return col_ssd[0] + cache + cache2;
}
y_tex += 1;
template<int RADIUS>
__device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
{
unsigned int ssd[N_DISPARITIES];
__syncthreads(); //before MinSSD function
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
__syncthreads();
ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)
int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));
int bestIdx = 0;
for (int i = 0; i < N_DISPARITIES; i++)
{
int idx = row * cminSSD_step;
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
if (minSSD.x < minSSDImage[idx])
if (mssd == ssd[i])
bestIdx = i;
}
return make_uint2(mssd, bestIdx);
}
template<int RADIUS>
__device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
{
unsigned char leftPixel1;
unsigned char leftPixel2;
unsigned char rightPixel1[8];
unsigned char rightPixel2[8];
unsigned int diff1, diff2;
leftPixel1 = imageL[idx1];
leftPixel2 = imageL[idx2];
idx1 = idx1 - d;
idx2 = idx2 - d;
rightPixel1[7] = imageR[idx1 - 7];
rightPixel1[0] = imageR[idx1 - 0];
rightPixel1[1] = imageR[idx1 - 1];
rightPixel1[2] = imageR[idx1 - 2];
rightPixel1[3] = imageR[idx1 - 3];
rightPixel1[4] = imageR[idx1 - 4];
rightPixel1[5] = imageR[idx1 - 5];
rightPixel1[6] = imageR[idx1 - 6];
rightPixel2[7] = imageR[idx2 - 7];
rightPixel2[0] = imageR[idx2 - 0];
rightPixel2[1] = imageR[idx2 - 1];
rightPixel2[2] = imageR[idx2 - 2];
rightPixel2[3] = imageR[idx2 - 3];
rightPixel2[4] = imageR[idx2 - 4];
rightPixel2[5] = imageR[idx2 - 5];
rightPixel2[6] = imageR[idx2 - 6];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
diff1 = leftPixel1 - rightPixel1[0];
diff2 = leftPixel2 - rightPixel2[0];
col_ssd[0 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[1];
diff2 = leftPixel2 - rightPixel2[1];
col_ssd[1 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[2];
diff2 = leftPixel2 - rightPixel2[2];
col_ssd[2 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[3];
diff2 = leftPixel2 - rightPixel2[3];
col_ssd[3 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[4];
diff2 = leftPixel2 - rightPixel2[4];
col_ssd[4 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[5];
diff2 = leftPixel2 - rightPixel2[5];
col_ssd[5 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[6];
diff2 = leftPixel2 - rightPixel2[6];
col_ssd[6 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[7];
diff2 = leftPixel2 - rightPixel2[7];
col_ssd[7 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
}
template<int RADIUS>
__device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
{
unsigned char leftPixel1;
int idx;
unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};
for(int i = 0; i < (2 * RADIUS + 1); i++)
{
idx = y_tex * im_pitch + x_tex;
leftPixel1 = imageL[idx];
idx = idx - d;
diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);
diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);
diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);
diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);
diffa[4] += SQ(leftPixel1 - imageR[idx - 4]);
diffa[5] += SQ(leftPixel1 - imageR[idx - 5]);
diffa[6] += SQ(leftPixel1 - imageR[idx - 6]);
diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
y_tex += 1;
}
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
col_ssd[0 * (BLOCK_W + 2 * RADIUS)] = diffa[0];
col_ssd[1 * (BLOCK_W + 2 * RADIUS)] = diffa[1];
col_ssd[2 * (BLOCK_W + 2 * RADIUS)] = diffa[2];
col_ssd[3 * (BLOCK_W + 2 * RADIUS)] = diffa[3];
col_ssd[4 * (BLOCK_W + 2 * RADIUS)] = diffa[4];
col_ssd[5 * (BLOCK_W + 2 * RADIUS)] = diffa[5];
col_ssd[6 * (BLOCK_W + 2 * RADIUS)] = diffa[6];
col_ssd[7 * (BLOCK_W + 2 * RADIUS)] = diffa[7];
}
template<int RADIUS>
__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)
{
extern __shared__ unsigned int col_ssd_cache[];
volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS)
//#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
//#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)
#define Y (blockIdx.y * ROWSperTHREAD + RADIUS)
//int Y = blockIdx.y * ROWSperTHREAD + RADIUS;
unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
unsigned char* disparImage = disp.data + X + Y * disp.step;
/* if (X < cwidth)
{
unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
*ptr = 0xFFFFFFFF;
}*/
int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
int y_tex;
int x_tex = X - RADIUS;
if (x_tex >= cwidth)
return;
for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
{
y_tex = Y - RADIUS;
InitColSSD<RADIUS>(x_tex, y_tex, img_step, left, right, d, col_ssd);
if (col_ssd_extra > 0)
if (x_tex + BLOCK_W < cwidth)
InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
__syncthreads(); //before MinSSD function
if (X < cwidth - RADIUS && Y < cheight - RADIUS)
{
disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
minSSDImage[idx] = minSSD.x;
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
if (minSSD.x < minSSDImage[0])
{
disparImage[0] = (unsigned char)(d + minSSD.y);
minSSDImage[0] = minSSD.x;
}
}
for(int row = 1; row < end_row; row++)
{
int idx1 = y_tex * img_step + x_tex;
int idx2 = (y_tex + (2 * RADIUS + 1)) * img_step + x_tex;
__syncthreads();
StepDown<RADIUS>(idx1, idx2, left, right, d, col_ssd);
if (col_ssd_extra)
if (x_tex + BLOCK_W < cwidth)
StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
y_tex += 1;
__syncthreads(); //before MinSSD function
if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)
{
int idx = row * cminSSD_step;
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
if (minSSD.x < minSSDImage[idx])
{
disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
minSSDImage[idx] = minSSD.x;
}
}
} // for row loop
} // for d loop
}
template<int RADIUS> void kernel_caller(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream)
{
dim3 grid(1,1,1);
dim3 threads(BLOCK_W, 1, 1);
grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
};
typedef void (*kernel_caller_t)(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream);
const static kernel_caller_t callers[] =
{
0,
kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
//0,0,0, 0,0,0, 0,0,kernel_caller<9>
};
const int calles_num = sizeof(callers)/sizeof(callers[0]);
void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)
{
int winsz2 = winsz >> 1;
if (winsz2 == 0 || winsz2 >= calles_num)
cv::gpu::error("Unsupported window size", __FILE__, __LINE__);
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );
size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) );
callers[winsz2](left, right, disp, maxdisp, stream);
}
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////// Sobel Prefiler ///////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
texture<unsigned char, 2, cudaReadModeElementType> texForSobel;
__global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < output.cols && y < output.rows)
{
int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +
(int)tex2D(texForSobel, x - 1, y ) * (-2) + (int)tex2D(texForSobel, x + 1, y ) * (2) +
(int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);
conv = ::min(::min(::max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);
output.ptr(y)[x] = conv & 0xFF;
}
}
void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap, cudaStream_t & stream)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, threads.y);
prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForSobel ) );
}
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////// Textureness filtering ////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
__device__ __forceinline__ float sobel(int x, int y)
{
float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) +
tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
return fabs(conv);
}
__device__ float CalcSums(float *cols, float *cols_cache, int winsz)
{
float cache = 0;
float cache2 = 0;
int winsz2 = winsz/2;
for(int i = 1; i <= winsz2; i++)
cache += cols[i];
cols_cache[0] = cache;
__syncthreads();
if (threadIdx.x < blockDim.x - winsz2)
cache2 = cols_cache[winsz2];
else
for(int i = winsz2 + 1; i < winsz; i++)
cache2 += cols[i];
return cols[0] + cache + cache2;
}
#define RpT (2 * ROWSperTHREAD) // got experimentally
__global__ void textureness_kernel(DevMem2Db disp, int winsz, float threshold)
{
int winsz2 = winsz/2;
int n_dirty_pixels = (winsz2) * 2;
extern __shared__ float cols_cache[];
float *cols = cols_cache + blockDim.x + threadIdx.x;
float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int beg_row = blockIdx.y * RpT;
int end_row = ::min(beg_row + RpT, disp.rows);
if (x < disp.cols)
{
int y = beg_row;
float sum = 0;
float sum_extra = 0;
for(int i = y - winsz2; i <= y + winsz2; ++i)
{
sum += sobel(x - winsz2, i);
if (cols_extra)
sum_extra += sobel(x + blockDim.x - winsz2, i);
}
*cols = sum;
if (cols_extra)
*cols_extra = sum_extra;
__syncthreads();
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
if (sum_win < threshold)
disp.data[y * disp.step + x] = 0;
__syncthreads();
for(int y = beg_row + 1; y < end_row; ++y)
{
sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);
*cols = sum;
if (cols_extra)
{
sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);
*cols_extra = sum_extra;
}
__syncthreads();
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
if (sum_win < threshold)
disp.data[y * disp.step + x] = 0;
__syncthreads();
}
}
} // for row loop
} // for d loop
}
template<int RADIUS> void kernel_caller(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream)
{
dim3 grid(1,1,1);
dim3 threads(BLOCK_W, 1, 1);
grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
};
typedef void (*kernel_caller_t)(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream);
const static kernel_caller_t callers[] =
{
0,
kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
//0,0,0, 0,0,0, 0,0,kernel_caller<9>
};
const int calles_num = sizeof(callers)/sizeof(callers[0]);
void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)
{
int winsz2 = winsz >> 1;
if (winsz2 == 0 || winsz2 >= calles_num)
cv::gpu::error("Unsupported window size", __FILE__, __LINE__);
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );
size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) );
callers[winsz2](left, right, disp, maxdisp, stream);
}
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////// Sobel Prefiler ///////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
texture<unsigned char, 2, cudaReadModeElementType> texForSobel;
__global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < output.cols && y < output.rows)
{
int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +
(int)tex2D(texForSobel, x - 1, y ) * (-2) + (int)tex2D(texForSobel, x + 1, y ) * (2) +
(int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);
conv = ::min(::min(::max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);
output.ptr(y)[x] = conv & 0xFF;
}
}
void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap, cudaStream_t & stream)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, threads.y);
prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForSobel ) );
}
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////// Textureness filtering ////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
__device__ __forceinline__ float sobel(int x, int y)
{
float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) +
tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
return fabs(conv);
}
__device__ float CalcSums(float *cols, float *cols_cache, int winsz)
{
float cache = 0;
float cache2 = 0;
int winsz2 = winsz/2;
for(int i = 1; i <= winsz2; i++)
cache += cols[i];
cols_cache[0] = cache;
__syncthreads();
if (threadIdx.x < blockDim.x - winsz2)
cache2 = cols_cache[winsz2];
else
for(int i = winsz2 + 1; i < winsz; i++)
cache2 += cols[i];
return cols[0] + cache + cache2;
}
#define RpT (2 * ROWSperTHREAD) // got experimentally
__global__ void textureness_kernel(DevMem2Db disp, int winsz, float threshold)
{
int winsz2 = winsz/2;
int n_dirty_pixels = (winsz2) * 2;
extern __shared__ float cols_cache[];
float *cols = cols_cache + blockDim.x + threadIdx.x;
float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int beg_row = blockIdx.y * RpT;
int end_row = ::min(beg_row + RpT, disp.rows);
if (x < disp.cols)
{
int y = beg_row;
float sum = 0;
float sum_extra = 0;
for(int i = y - winsz2; i <= y + winsz2; ++i)
{
sum += sobel(x - winsz2, i);
if (cols_extra)
sum_extra += sobel(x + blockDim.x - winsz2, i);
}
*cols = sum;
if (cols_extra)
*cols_extra = sum_extra;
__syncthreads();
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
if (sum_win < threshold)
disp.data[y * disp.step + x] = 0;
__syncthreads();
for(int y = beg_row + 1; y < end_row; ++y)
void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream)
{
sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);
*cols = sum;
avgTexturenessThreshold *= winsz * winsz;
if (cols_extra)
{
sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);
*cols_extra = sum_extra;
}
texForTF.filterMode = cudaFilterModeLinear;
texForTF.addressMode[0] = cudaAddressModeWrap;
texForTF.addressMode[1] = cudaAddressModeWrap;
__syncthreads();
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
if (sum_win < threshold)
disp.data[y * disp.step + x] = 0;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );
__syncthreads();
dim3 threads(128, 1, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, RpT);
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForTF) );
}
}
}
void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream)
{
avgTexturenessThreshold *= winsz * winsz;
texForTF.filterMode = cudaFilterModeLinear;
texForTF.addressMode[0] = cudaAddressModeWrap;
texForTF.addressMode[1] = cudaAddressModeWrap;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );
dim3 threads(128, 1, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, RpT);
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForTF) );
}
} // namespace stereobm
END_OPENCV_DEVICE_NAMESPACE
} // namespace stereobm
}}} // namespace cv { namespace gpu { namespace device

View File

@@ -44,489 +44,487 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace stereobp {
///////////////////////////////////////////////////////////////
/////////////////////// load constants ////////////////////////
///////////////////////////////////////////////////////////////
__constant__ int cndisp;
__constant__ float cmax_data_term;
__constant__ float cdata_weight;
__constant__ float cmax_disc_term;
__constant__ float cdisc_single_jump;
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
namespace cv { namespace gpu { namespace device
{
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int )) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
}
///////////////////////////////////////////////////////////////
////////////////////////// comp data //////////////////////////
///////////////////////////////////////////////////////////////
template <int cn> struct PixDiff;
template <> struct PixDiff<1>
{
__device__ __forceinline__ PixDiff(const uchar* ls)
namespace stereobp
{
l = *ls;
}
__device__ __forceinline__ float operator()(const uchar* rs) const
{
return ::abs((int)l - *rs);
}
uchar l;
};
template <> struct PixDiff<3>
{
__device__ __forceinline__ PixDiff(const uchar* ls)
{
l = *((uchar3*)ls);
}
__device__ __forceinline__ float operator()(const uchar* rs) const
{
const float tr = 0.299f;
const float tg = 0.587f;
const float tb = 0.114f;
///////////////////////////////////////////////////////////////
/////////////////////// load constants ////////////////////////
///////////////////////////////////////////////////////////////
float val = tb * ::abs((int)l.x - rs[0]);
val += tg * ::abs((int)l.y - rs[1]);
val += tr * ::abs((int)l.z - rs[2]);
__constant__ int cndisp;
__constant__ float cmax_data_term;
__constant__ float cdata_weight;
__constant__ float cmax_disc_term;
__constant__ float cdisc_single_jump;
return val;
}
uchar3 l;
};
template <> struct PixDiff<4>
{
__device__ __forceinline__ PixDiff(const uchar* ls)
{
l = *((uchar4*)ls);
}
__device__ __forceinline__ float operator()(const uchar* rs) const
{
const float tr = 0.299f;
const float tg = 0.587f;
const float tb = 0.114f;
uchar4 r = *((uchar4*)rs);
float val = tb * ::abs((int)l.x - r.x);
val += tg * ::abs((int)l.y - r.y);
val += tr * ::abs((int)l.z - r.z);
return val;
}
uchar4 l;
};
template <int cn, typename D>
__global__ void comp_data(const DevMem2Db left, const PtrStepb right, PtrElemStep_<D> data)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1)
{
const uchar* ls = left.ptr(y) + x * cn;
const PixDiff<cn> pixDiff(ls);
const uchar* rs = right.ptr(y) + x * cn;
D* ds = data.ptr(y) + x;
const size_t disp_step = data.step * left.rows;
for (int disp = 0; disp < cndisp; disp++)
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
{
if (x - disp >= 1)
{
float val = pixDiff(rs - disp * cn);
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int )) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
}
ds[disp * disp_step] = saturate_cast<D>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));
///////////////////////////////////////////////////////////////
////////////////////////// comp data //////////////////////////
///////////////////////////////////////////////////////////////
template <int cn> struct PixDiff;
template <> struct PixDiff<1>
{
__device__ __forceinline__ PixDiff(const uchar* ls)
{
l = *ls;
}
else
__device__ __forceinline__ float operator()(const uchar* rs) const
{
ds[disp * disp_step] = saturate_cast<D>(cdata_weight * cmax_data_term);
return ::abs((int)l - *rs);
}
}
}
}
template<typename T, typename D>
void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);
template <> void comp_data_gpu<uchar, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar3, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar3, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar4, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar4, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////
//////////////////////// data step down ///////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst_cols && y < dst_rows)
{
for (int d = 0; d < cndisp; ++d)
uchar l;
};
template <> struct PixDiff<3>
{
float dst_reg = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];
dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+0)];
dst_reg += src.ptr(d * src_rows + (2*y+0))[(2*x+1)];
dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+1)];
dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);
}
}
}
template<typename T>
void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(dst_cols, threads.x);
grid.y = divUp(dst_rows, threads.y);
data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
///////////////////////////////////////////////////////////////
/////////////////// level up messages ////////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrElemStep_<T> src, PtrElemStep_<T> dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst_cols && y < dst_rows)
{
const size_t dst_disp_step = dst.step * dst_rows;
const size_t src_disp_step = src.step * src_rows;
T* dstr = dst.ptr(y ) + x;
const T* srcr = src.ptr(y/2) + x/2;
for (int d = 0; d < cndisp; ++d)
dstr[d * dst_disp_step] = srcr[d * src_disp_step];
}
}
template <typename T>
void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(dst_cols, threads.x);
grid.y = divUp(dst_rows, threads.y);
int src_idx = (dst_idx + 1) & 1;
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
///////////////////////////////////////////////////////////////
//////////////////// calc all iterations /////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__device__ void calc_min_linear_penalty(T* dst, size_t step)
{
float prev = dst[0];
float cur;
for (int disp = 1; disp < cndisp; ++disp)
{
prev += cdisc_single_jump;
cur = dst[step * disp];
if (prev < cur)
{
cur = prev;
dst[step * disp] = saturate_cast<T>(prev);
}
prev = cur;
}
prev = dst[(cndisp - 1) * step];
for (int disp = cndisp - 2; disp >= 0; disp--)
{
prev += cdisc_single_jump;
cur = dst[step * disp];
if (prev < cur)
{
cur = prev;
dst[step * disp] = saturate_cast<T>(prev);
}
prev = cur;
}
}
template <typename T>
__device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
{
float minimum = device::numeric_limits<float>::max();
for(int i = 0; i < cndisp; ++i)
{
float dst_reg = msg1[msg_disp_step * i];
dst_reg += msg2[msg_disp_step * i];
dst_reg += msg3[msg_disp_step * i];
dst_reg += data[data_disp_step * i];
if (dst_reg < minimum)
minimum = dst_reg;
dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);
}
calc_min_linear_penalty(dst, msg_disp_step);
minimum += cmax_disc_term;
float sum = 0;
for(int i = 0; i < cndisp; ++i)
{
float dst_reg = dst[msg_disp_step * i];
if (dst_reg > minimum)
{
dst_reg = minimum;
dst[msg_disp_step * i] = saturate_cast<T>(minimum);
}
sum += dst_reg;
}
sum /= cndisp;
for(int i = 0; i < cndisp; ++i)
dst[msg_disp_step * i] -= sum;
}
template <typename T>
__global__ void one_iteration(int t, PtrElemStep_<T> u, T* d, T* l, T* r, const PtrElemStep_<T> data, int cols, int rows)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
{
T* us = u.ptr(y) + x;
T* ds = d + y * u.step + x;
T* ls = l + y * u.step + x;
T* rs = r + y * u.step + x;
const T* dt = data.ptr(y) + x;
size_t msg_disp_step = u.step * rows;
size_t data_disp_step = data.step * rows;
message(us + u.step, ls + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);
message(ds - u.step, ls + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);
message(us + u.step, ds - u.step, rs - 1, dt, rs, msg_disp_step, data_disp_step);
message(us + u.step, ds - u.step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
}
}
template <typename T>
void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d,
const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(cols, threads.x << 1);
grid.y = divUp(rows, threads.y);
for(int t = 0; t < iters; ++t)
{
one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
///////////////////////////////////////////////////////////////
/////////////////////////// output ////////////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__global__ void output(const PtrElemStep_<T> u, const T* d, const T* l, const T* r, const T* data,
DevMem2D_<short> disp)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
{
const T* us = u.ptr(y + 1) + x;
const T* ds = d + (y - 1) * u.step + x;
const T* ls = l + y * u.step + (x + 1);
const T* rs = r + y * u.step + (x - 1);
const T* dt = data + y * u.step + x;
size_t disp_step = disp.rows * u.step;
int best = 0;
float best_val = numeric_limits<float>::max();
for (int d = 0; d < cndisp; ++d)
{
float val = us[d * disp_step];
val += ds[d * disp_step];
val += ls[d * disp_step];
val += rs[d * disp_step];
val += dt[d * disp_step];
if (val < best_val)
__device__ __forceinline__ PixDiff(const uchar* ls)
{
best_val = val;
best = d;
l = *((uchar3*)ls);
}
__device__ __forceinline__ float operator()(const uchar* rs) const
{
const float tr = 0.299f;
const float tg = 0.587f;
const float tb = 0.114f;
float val = tb * ::abs((int)l.x - rs[0]);
val += tg * ::abs((int)l.y - rs[1]);
val += tr * ::abs((int)l.z - rs[2]);
return val;
}
uchar3 l;
};
template <> struct PixDiff<4>
{
__device__ __forceinline__ PixDiff(const uchar* ls)
{
l = *((uchar4*)ls);
}
__device__ __forceinline__ float operator()(const uchar* rs) const
{
const float tr = 0.299f;
const float tg = 0.587f;
const float tb = 0.114f;
uchar4 r = *((uchar4*)rs);
float val = tb * ::abs((int)l.x - r.x);
val += tg * ::abs((int)l.y - r.y);
val += tr * ::abs((int)l.z - r.z);
return val;
}
uchar4 l;
};
template <int cn, typename D>
__global__ void comp_data(const DevMem2Db left, const PtrStepb right, PtrElemStep_<D> data)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1)
{
const uchar* ls = left.ptr(y) + x * cn;
const PixDiff<cn> pixDiff(ls);
const uchar* rs = right.ptr(y) + x * cn;
D* ds = data.ptr(y) + x;
const size_t disp_step = data.step * left.rows;
for (int disp = 0; disp < cndisp; disp++)
{
if (x - disp >= 1)
{
float val = pixDiff(rs - disp * cn);
ds[disp * disp_step] = saturate_cast<D>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));
}
else
{
ds[disp * disp_step] = saturate_cast<D>(cdata_weight * cmax_data_term);
}
}
}
}
disp.ptr(y)[x] = saturate_cast<short>(best);
}
}
template<typename T, typename D>
void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);
template <typename T>
void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,
const DevMem2D_<short>& disp, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
template <> void comp_data_gpu<uchar, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x);
grid.y = divUp(disp.rows, threads.y);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
cudaSafeCall( cudaGetLastError() );
comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
template void output_gpu<short>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
template void output_gpu<float>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
} // namespace stereobp
comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
cudaSafeCall( cudaGetLastError() );
END_OPENCV_DEVICE_NAMESPACE
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar3, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar3, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar4, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar4, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(left.cols, threads.x);
grid.y = divUp(left.rows, threads.y);
comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////
//////////////////////// data step down ///////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst_cols && y < dst_rows)
{
for (int d = 0; d < cndisp; ++d)
{
float dst_reg = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];
dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+0)];
dst_reg += src.ptr(d * src_rows + (2*y+0))[(2*x+1)];
dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+1)];
dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);
}
}
}
template<typename T>
void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(dst_cols, threads.x);
grid.y = divUp(dst_rows, threads.y);
data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
///////////////////////////////////////////////////////////////
/////////////////// level up messages ////////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrElemStep_<T> src, PtrElemStep_<T> dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst_cols && y < dst_rows)
{
const size_t dst_disp_step = dst.step * dst_rows;
const size_t src_disp_step = src.step * src_rows;
T* dstr = dst.ptr(y ) + x;
const T* srcr = src.ptr(y/2) + x/2;
for (int d = 0; d < cndisp; ++d)
dstr[d * dst_disp_step] = srcr[d * src_disp_step];
}
}
template <typename T>
void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(dst_cols, threads.x);
grid.y = divUp(dst_rows, threads.y);
int src_idx = (dst_idx + 1) & 1;
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
///////////////////////////////////////////////////////////////
//////////////////// calc all iterations /////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__device__ void calc_min_linear_penalty(T* dst, size_t step)
{
float prev = dst[0];
float cur;
for (int disp = 1; disp < cndisp; ++disp)
{
prev += cdisc_single_jump;
cur = dst[step * disp];
if (prev < cur)
{
cur = prev;
dst[step * disp] = saturate_cast<T>(prev);
}
prev = cur;
}
prev = dst[(cndisp - 1) * step];
for (int disp = cndisp - 2; disp >= 0; disp--)
{
prev += cdisc_single_jump;
cur = dst[step * disp];
if (prev < cur)
{
cur = prev;
dst[step * disp] = saturate_cast<T>(prev);
}
prev = cur;
}
}
template <typename T>
__device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
{
float minimum = device::numeric_limits<float>::max();
for(int i = 0; i < cndisp; ++i)
{
float dst_reg = msg1[msg_disp_step * i];
dst_reg += msg2[msg_disp_step * i];
dst_reg += msg3[msg_disp_step * i];
dst_reg += data[data_disp_step * i];
if (dst_reg < minimum)
minimum = dst_reg;
dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);
}
calc_min_linear_penalty(dst, msg_disp_step);
minimum += cmax_disc_term;
float sum = 0;
for(int i = 0; i < cndisp; ++i)
{
float dst_reg = dst[msg_disp_step * i];
if (dst_reg > minimum)
{
dst_reg = minimum;
dst[msg_disp_step * i] = saturate_cast<T>(minimum);
}
sum += dst_reg;
}
sum /= cndisp;
for(int i = 0; i < cndisp; ++i)
dst[msg_disp_step * i] -= sum;
}
template <typename T>
__global__ void one_iteration(int t, PtrElemStep_<T> u, T* d, T* l, T* r, const PtrElemStep_<T> data, int cols, int rows)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
{
T* us = u.ptr(y) + x;
T* ds = d + y * u.step + x;
T* ls = l + y * u.step + x;
T* rs = r + y * u.step + x;
const T* dt = data.ptr(y) + x;
size_t msg_disp_step = u.step * rows;
size_t data_disp_step = data.step * rows;
message(us + u.step, ls + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);
message(ds - u.step, ls + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);
message(us + u.step, ds - u.step, rs - 1, dt, rs, msg_disp_step, data_disp_step);
message(us + u.step, ds - u.step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
}
}
template <typename T>
void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d,
const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(cols, threads.x << 1);
grid.y = divUp(rows, threads.y);
for(int t = 0; t < iters; ++t)
{
one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
///////////////////////////////////////////////////////////////
/////////////////////////// output ////////////////////////////
///////////////////////////////////////////////////////////////
template <typename T>
__global__ void output(const PtrElemStep_<T> u, const T* d, const T* l, const T* r, const T* data,
DevMem2D_<short> disp)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
{
const T* us = u.ptr(y + 1) + x;
const T* ds = d + (y - 1) * u.step + x;
const T* ls = l + y * u.step + (x + 1);
const T* rs = r + y * u.step + (x - 1);
const T* dt = data + y * u.step + x;
size_t disp_step = disp.rows * u.step;
int best = 0;
float best_val = numeric_limits<float>::max();
for (int d = 0; d < cndisp; ++d)
{
float val = us[d * disp_step];
val += ds[d * disp_step];
val += ls[d * disp_step];
val += rs[d * disp_step];
val += dt[d * disp_step];
if (val < best_val)
{
best_val = val;
best = d;
}
}
disp.ptr(y)[x] = saturate_cast<short>(best);
}
}
template <typename T>
void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,
const DevMem2D_<short>& disp, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x);
grid.y = divUp(disp.rows, threads.y);
output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void output_gpu<short>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
template void output_gpu<float>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
} // namespace stereobp
}}} // namespace cv { namespace gpu { namespace device

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff