scan operations are moved into a separate header

parent 8748cbc232
commit e7f6c4b7ef
@@ -1,8 +1,8 @@
 if(${CMAKE_VERSION} VERSION_LESS "2.8.3")
     message(STATUS "WITH_CUDA flag requires CMake 2.8.3. CUDA support is disabled.")
     return()
 endif()

 find_package(CUDA 4.1)

 if(CUDA_FOUND)
@@ -23,7 +23,7 @@ if(CUDA_FOUND)
 else()
     set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
 endif()

 set(CUDA_ARCH_PTX "2.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")

 string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
@@ -89,8 +89,8 @@ if(CUDA_FOUND)
     set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
 endif()

 # we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
 set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG})
 string(REPLACE "-ggdb3" "" CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
 CUDA_COMPILE(${VAR} ${ARGN})
 set(CMAKE_CXX_DEBUG_FLAGS ${CMAKE_CXX_FLAGS_DEBUG_})
@@ -629,10 +629,6 @@ CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, doubl
 //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA
 CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());

-//! resizes the image
-//! Supports INTER_AREA
-CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, GpuMat& buffer, Size dsize, double fx=0, double fy=0, int interpolation = INTER_AREA, Stream& stream = Stream::Null());
-
 //! warps the image using affine transformation
 //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
 CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR,
@@ -118,10 +118,10 @@ GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale)

 INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine(
     ALL_DEVICES,
-    testing::Values(perf::sz1080p, cv::Size(4096, 2048)),
-    testing::Values(MatType(CV_8UC1)/*, MatType(CV_8UC3), MatType(CV_8UC4),
+    testing::Values(perf::sz1080p/*, cv::Size(4096, 2048)*/),
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
     MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
+    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
     testing::Values(Scale(0.2),Scale(0.1),Scale(0.05))));

 //////////////////////////////////////////////////////////////////////
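For reference, INSTANTIATE_TEST_CASE_P/testing::Combine above builds the full cross product of devices, sizes, types and scales. A minimal self-contained sketch of the same pattern with legacy Google Test (names here are illustrative, not from the OpenCV tree):

#include <gtest/gtest.h>

// Hypothetical fixture parameterized over (size, scale); legacy GTest used
// std::tr1::tuple for Combine, as OpenCV did at the time of this commit.
class ResizeAreaSketch : public ::testing::TestWithParam< std::tr1::tuple<int, double> >
{
};

TEST_P(ResizeAreaSketch, Runs)
{
    const int    side  = std::tr1::get<0>(GetParam());
    const double scale = std::tr1::get<1>(GetParam());
    ASSERT_GT(side * scale, 0.0); // stand-in for the real resize + timing code
}

// testing::Combine builds the cross product of all axes, exactly as the
// ResizeArea instantiation above does for devices/sizes/types/scales.
INSTANTIATE_TEST_CASE_P(ImgProcSketch, ResizeAreaSketch, ::testing::Combine(
    ::testing::Values(1080, 2048),
    ::testing::Values(0.2, 0.1, 0.05)));

int main(int argc, char** argv)
{
    ::testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
}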
@@ -1253,7 +1253,7 @@ namespace cv { namespace gpu { namespace device
 {
     const T val;

-    __host__ explicit CompareScalar(T val) : val(val) {}
+    __host__ explicit CompareScalar(T val_) : val(val_) {}

     __device__ __forceinline__ uchar operator()(T src) const
     {
@@ -1266,7 +1266,7 @@ namespace cv { namespace gpu { namespace device
 {
     const TYPE_VEC(T, 2) val;

-    __host__ explicit CompareScalar(TYPE_VEC(T, 2) val) : val(val) {}
+    __host__ explicit CompareScalar(TYPE_VEC(T, 2) val_) : val(val_) {}

     __device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const
     {
@@ -1281,7 +1281,7 @@ namespace cv { namespace gpu { namespace device
 {
     const TYPE_VEC(T, 3) val;

-    __host__ explicit CompareScalar(TYPE_VEC(T, 3) val) : val(val) {}
+    __host__ explicit CompareScalar(TYPE_VEC(T, 3) val_) : val(val_) {}

     __device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const
     {
@@ -1297,7 +1297,7 @@ namespace cv { namespace gpu { namespace device
 {
     const TYPE_VEC(T, 4) val;

-    __host__ explicit CompareScalar(TYPE_VEC(T, 4) val) : val(val) {}
+    __host__ explicit CompareScalar(TYPE_VEC(T, 4) val_) : val(val_) {}

     __device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const
     {
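The val -> val_ renames above (and the mask_/alignment_ renames later in this commit) silence constructor-parameter shadowing warnings; a minimal sketch of the pattern:

// Minimal sketch of the shadowing fix (illustrative type, not the real one).
struct CompareScalarSketch
{
    const int val;

    // Before: the parameter was also named `val`, shadowing the member.
    // `val(val)` still initialized the member correctly, but -Wshadow warns.
    // After: the underscore suffix keeps member and parameter distinct.
    explicit CompareScalarSketch(int val_) : val(val_) {}
};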
@@ -72,7 +72,7 @@ namespace cv { namespace gpu { namespace device

 struct Mask8U
 {
-    explicit Mask8U(PtrStepb mask): mask(mask) {}
+    explicit Mask8U(PtrStepb mask_): mask(mask_) {}

     __device__ __forceinline__ bool operator()(int y, int x) const
     {
@@ -46,7 +46,8 @@
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/filters.hpp"
-# include <cfloat>
+#include <cfloat>
+#include <opencv2/gpu/device/scan.hpp>

 namespace cv { namespace gpu { namespace device
 {
@@ -285,367 +286,5 @@ namespace cv { namespace gpu { namespace device
     typedef float scan_line_type;
 };

-// template <typename T>
-// __global__ void resize_area_scan(const DevMem2D_<T> src, DevMem2D_<T> dst, int fx, int fy, DevMem2D_<T> buffer)
-// {
-// typedef typename scan_traits<T>::scan_line_type W;
-// extern __shared__ W line[];
-
-// const int x = threadIdx.x;
-// const int y = blockIdx.x;
-
-// if (y >= src.rows) return;
-
-// int offset = 1;
-
-// line[2 * x + 0] = src(y, 2 * x + 0);
-// line[2 * x + 1] = src(y, 2 * x + 1);
-
-// __syncthreads();//???
-// // reduction
-// for (int d = blockDim.x; d > 0; d >>= 1)
-// {
-// __syncthreads();
-// if (x < d)
-// {
-// int ai = 2 * x * offset -1 + 1 * offset;
-// int bi = 2 * x * offset -1 + 2 * offset;
-// line[bi] += line[ai];
-// }
-
-// offset *= 2;
-// }
-
-// __syncthreads();
-// // convolution
-// if (x == 0) { line[(blockDim.x << 1) - 1] = 0; printf("offset: %d!!!!!!!!!!!!!\n", fx);}
-
-// for (int d = 1; d < (blockDim.x << 1); d *= 2)
-// {
-// offset >>= 1;
-
-// __syncthreads();
-// if (x < d)
-// {
-// int ai = offset * 2 * x + 1 * offset - 1;
-// int bi = offset * 2 * x + 2 * offset - 1;
-
-// W t = line[ai];
-// line[ai] = line[bi];
-// line[bi] += t;
-// }
-// }
-// __syncthreads();
-
-// // calculate sum
-// int start = 0;
-// int out_idx = 0;
-// int end = start + fx;
-// while (start < (blockDim.x << 1) && end < (blockDim.x << 1))
-// {
-// buffer(y, out_idx) = saturate_cast<T>((line[end] - line[start]) / fx);
-// start = end;
-// end = start + fx;
-// out_idx++;
-// }
-
-// }
-
-template <typename T>
-__device__ void scan_y(DevMem2D_<typename scan_traits<T>::scan_line_type> buffer,int fx, int fy, DevMem2D_<T> dst,
-                       typename scan_traits<T>::scan_line_type* line, int g_base)
-{
-    typedef typename scan_traits<T>::scan_line_type W;
-
-    const int y = threadIdx.x;
-    const int x = blockIdx.x;
-
-    float scale = 1.f / (fx * fy);
-
-    if (x >= buffer.cols) return;
-
-    int offset = 1;
-    line[2 * y + 0] = buffer((g_base * fy) + 2 * y + 1, x);
-
-    if (y != (blockDim.x -1) )
-        line[2 * y + 1] = buffer((g_base * fy) + 2 * y + 2, x);
-    else
-        line[2 * y + 1] = 0;
-
-    __syncthreads();
-
-    // reduction
-    for (int d = blockDim.x; d > 0; d >>= 1)
-    {
-        __syncthreads();
-        if (y < d)
-        {
-            int ai = 2 * y * offset -1 + 1 * offset;
-            int bi = 2 * y * offset -1 + 2 * offset;
-            line[bi] += line[ai];
-        }
-
-        offset *= 2;
-    }
-
-    __syncthreads();
-    // convolution
-    if (y == 0) line[(blockDim.x << 1) - 1] = (W)buffer(0, x);
-
-    for (int d = 1; d < (blockDim.x << 1); d *= 2)
-    {
-        offset >>= 1;
-
-        __syncthreads();
-        if (y < d)
-        {
-            int ai = offset * 2 * y + 1 * offset - 1;
-            int bi = offset * 2 * y + 2 * offset - 1;
-
-
-            W t = line[ai];
-            line[ai] = line[bi];
-            line[bi] += t;
-        }
-    }
-    __syncthreads();
-
-    if (y < dst.rows)
-    {
-        W start = (y == 0)? (W)0:line[y * fy -1];
-        W end = line[y * fy + fy - 1];
-        dst(g_base + y ,x) = saturate_cast<T>((end - start) * scale);
-    }
-}
-
-template <typename T>
-__device__ void scan_x(const DevMem2D_<T> src, int fx, int fy, DevMem2D_<typename scan_traits<T>::scan_line_type> buffer,
-                       typename scan_traits<T>::scan_line_type* line, int g_base)
-{
-    typedef typename scan_traits<T>::scan_line_type W;
-
-    const int x = threadIdx.x;
-    const int y = blockIdx.x;
-
-    float scale = 1.f / (fx * fy);
-
-    if (y >= src.rows) return;
-
-    int offset = 1;
-
-    line[2 * x + 0] = (W)src(y, (g_base * fx) + 2 * x + 1);
-
-    if (x != (blockDim.x -1) )
-        line[2 * x + 1] = (W)src(y, (g_base * fx) + 2 * x + 2);
-    else
-        line[2 * x + 1] = 0;
-
-    __syncthreads();
-
-    // reduction
-    for (int d = blockDim.x; d > 0; d >>= 1)
-    {
-        __syncthreads();
-        if (x < d)
-        {
-            int ai = 2 * x * offset -1 + 1 * offset;
-            int bi = 2 * x * offset -1 + 2 * offset;
-            line[bi] += line[ai];
-        }
-
-        offset *= 2;
-    }
-
-    __syncthreads();
-    // convolution
-    if (x == 0) line[(blockDim.x << 1) - 1] = (W)src(y, 0);
-
-    for (int d = 1; d < (blockDim.x << 1); d *= 2)
-    {
-        offset >>= 1;
-
-        __syncthreads();
-        if (x < d)
-        {
-            int ai = offset * 2 * x + 1 * offset - 1;
-            int bi = offset * 2 * x + 2 * offset - 1;
-
-            W t = line[ai];
-            line[ai] = line[bi];
-            line[bi] += t;
-        }
-    }
-    __syncthreads();
-
-    if (x < buffer.cols)
-    {
-        W start = (x == 0)? (W)0:line[x * fx -1];
-        W end = line[x * fx + fx - 1];
-        buffer(y, g_base + x) =(end - start);
-    }
-}
-
-enum ScanKind { exclusive, inclusive } ;
-
-template <ScanKind Kind , class T>
-__device__ __forceinline__ T scan_warp ( volatile T *ptr , const unsigned int idx = threadIdx.x )
-{
-    const unsigned int lane = idx & 31;
-
-    if ( lane >= 1) ptr [idx ] = ptr [idx - 1] + ptr [idx];
-    if ( lane >= 2) ptr [idx ] = ptr [idx - 2] + ptr [idx];
-    if ( lane >= 4) ptr [idx ] = ptr [idx - 4] + ptr [idx];
-    if ( lane >= 8) ptr [idx ] = ptr [idx - 8] + ptr [idx];
-    if ( lane >= 16) ptr [idx ] = ptr [idx - 16] + ptr [idx];
-
-    if( Kind == inclusive )
-        return ptr [idx ];
-    else
-        return (lane > 0) ? ptr [idx - 1] : 0;
-}
-
-template <ScanKind Kind , class T>
-__device__ __forceinline__ T scan_block( volatile T *ptr)
-{
-    const unsigned int idx = threadIdx.x;
-    const unsigned int lane = idx & 31;
-    const unsigned int warp = idx >> 5;
-
-    T val = scan_warp <Kind>( ptr , idx );
-    __syncthreads ();
-
-    if( lane == 31 )
-        ptr [ warp ] = ptr [idx ];
-
-    __syncthreads ();
-
-    if( warp == 0 )
-        scan_warp<inclusive>( ptr , idx );
-
-    __syncthreads ();
-
-    if ( warp > 0)
-        val = ptr [warp -1] + val;
-
-    __syncthreads ();
-
-    ptr[idx] = val;
-
-    __syncthreads ();
-
-    return val ;
-}
-
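The scan_warp/scan_block pair deleted above is a textbook Hillis-Steele scan; per the commit title it now lives in scan.hpp. A standalone CUDA sketch of the warp-level step, assuming the 2012-era implicit warp-synchronous execution this code relied on:

#include <cstdio>
#include <cuda_runtime.h>

// Hillis-Steele inclusive scan within one 32-thread warp, mirroring the
// removed scan_warp above (sketch only; the relocated code lives in scan.hpp).
__device__ int warp_inclusive_scan(volatile int* ptr, unsigned int idx)
{
    const unsigned int lane = idx & 31;
    if (lane >=  1) ptr[idx] = ptr[idx -  1] + ptr[idx];
    if (lane >=  2) ptr[idx] = ptr[idx -  2] + ptr[idx];
    if (lane >=  4) ptr[idx] = ptr[idx -  4] + ptr[idx];
    if (lane >=  8) ptr[idx] = ptr[idx -  8] + ptr[idx];
    if (lane >= 16) ptr[idx] = ptr[idx - 16] + ptr[idx];
    return ptr[idx];
}

__global__ void scan_one_warp(int* data)
{
    __shared__ int buf[32];
    buf[threadIdx.x] = data[threadIdx.x];
    warp_inclusive_scan(buf, threadIdx.x);   // safe: all 32 threads share one warp
    data[threadIdx.x] = buf[threadIdx.x];
}

int main()
{
    int h[32], *d;
    for (int i = 0; i < 32; ++i) h[i] = 1;   // scan of ones -> 1..32
    cudaMalloc(&d, sizeof(h));
    cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);
    scan_one_warp<<<1, 32>>>(d);
    cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
    printf("last element: %d\n", h[31]);     // expect 32
    cudaFree(d);
    return 0;
}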
-template<typename T, typename W>
-__global__ void resise_scan_fast_x(const DevMem2D_<T> src, DevMem2D_<W> dst, int fx, int fy, int thred_lines, int stride)
-{
-    extern __shared__ W sbuf[];
-
-    const unsigned int tid = threadIdx. x;
-
-    // load line-block on shared memory
-    int y = blockIdx.x / thred_lines;
-    int input_stride = (blockIdx.x % thred_lines) * stride;
-    int x = input_stride + tid;
-
-    // store global data in shared memory
-    if (x < src.cols && y < src.rows)
-        sbuf[tid] = src(y, x);
-    else
-        sbuf[tid] = 0;
-    __syncthreads();
-
-    scan_block<inclusive, W>(sbuf);
-
-    float scale = __fdividef(1.f, fx);
-    int out_stride = input_stride / fx;
-    int count = blockDim.x / fx;
-
-    if (tid < count)
-    {
-        int start_idx = (tid == 0)? 0 : tid * fx - 1;
-        int end_idx = tid * fx + fx - 1;
-
-        W start = (tid == 0)? (W)0:sbuf[start_idx];
-        W end = sbuf[end_idx];
-
-        dst(y, out_stride + tid) = (end - start);
-    }
-}
-
-template<typename T, typename W>
-__global__ void resise_scan_fast_y(const DevMem2D_<W> src, DevMem2D_<T> dst, int fx, int fy, int thred_lines, int stride)
-{
-    extern __shared__ W sbuf[];
-
-    const unsigned int tid = threadIdx. x;
-
-    // load line-block on shared memory
-    int x = blockIdx.x / thred_lines;
-
-    int global_stride = (blockIdx.x % thred_lines) * stride;
-    int y = global_stride + tid;
-
-    // store global data in shared memory
-    if (x < src.cols && y < src.rows)
-        sbuf[tid] = src(y, x);
-    else
-        sbuf[tid] = 0;
-
-    __syncthreads();
-    scan_block<inclusive, W>(sbuf);
-
-    float scale = __fdividef(1.f, fx * fy);
-    int out_stride = global_stride / fx;
-    int count = blockDim.x / fx;
-
-    if (tid < count)
-    {
-        int start_idx = (tid == 0)? 0 : tid * fx - 1;
-        int end_idx = tid * fx + fx - 1;
-
-        W start = (tid == 0)? (W)0:sbuf[start_idx];
-        W end = sbuf[end_idx];
-
-        dst(out_stride + tid, x) = saturate_cast<T>((end - start) * scale);
-    }
-}
-
-template <typename T>
-void resize_area_gpu(const DevMem2Db src, DevMem2Db dst,float fx, float fy,
-                     int interpolation, DevMem2Df buffer, cudaStream_t stream)
-{
-    (void)interpolation;
-
-    int iscale_x = round(fx);
-    int iscale_y = round(fy);
-
-    int warps = 4;
-    const int threads = 32 * warps;
-    int input_stride = threads / iscale_x;
-
-    int thred_lines = divUp(src.cols, input_stride * iscale_x);
-    int blocks = src.rows * thred_lines;
-
-    typedef typename scan_traits<T>::scan_line_type smem_type;
-
-    resise_scan_fast_x<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
-        (src, buffer, iscale_x, iscale_y, thred_lines, input_stride * iscale_x);
-
-    input_stride = threads / iscale_y;
-    thred_lines = divUp(src.rows, input_stride * iscale_y);
-    blocks = dst.cols * thred_lines;
-
-    resise_scan_fast_y<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
-        (buffer, dst, iscale_x, iscale_y, thred_lines, input_stride * iscale_y);
-
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-template void resize_area_gpu<uchar>(DevMem2Db src, DevMem2Db dst, float fx, float fy, int interpolation, DevMem2Df buffer, cudaStream_t stream);
-
 } // namespace imgproc
 }}} // namespace cv { namespace gpu { namespace device
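What the deleted resise_scan_fast_* kernels compute: after an inclusive prefix sum S, the sum over any window is S[end] - S[start], so each INTER_AREA output pixel is a box average obtained from just two reads. A host-side sketch of that arithmetic:

#include <cstdio>
#include <vector>

// Host-side sketch of the scan-based box average used by the deleted
// resise_scan_fast_* kernels: one inclusive prefix sum, then each output
// sample is (S[end] - S[start]) * scale for its fx-wide window.
int main()
{
    const int fx = 4;                       // integer downscale factor
    std::vector<float> src(16, 2.0f), scan(16), dst(16 / fx);

    float running = 0.0f;
    for (size_t i = 0; i < src.size(); ++i) // inclusive scan
        scan[i] = (running += src[i]);

    for (size_t o = 0; o < dst.size(); ++o)
    {
        const float start = (o == 0) ? 0.0f : scan[o * fx - 1];
        const float end   = scan[o * fx + fx - 1];
        dst[o] = (end - start) / fx;        // box average, expect 2.0
    }
    printf("dst[0] = %f\n", dst[0]);
    return 0;
}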
@@ -228,9 +228,9 @@ namespace cv { namespace gpu { namespace device
 template <typename T>
 static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
 {
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-    mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
+    dim3 block(32, 8);
+    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+    mergeC2_<T><<<grid, block, 0, stream>>>(
         src[0].data, src[0].step,
         src[1].data, src[1].step,
         dst.rows, dst.cols, dst.data, dst.step);
@@ -244,9 +244,9 @@ namespace cv { namespace gpu { namespace device
 template <typename T>
 static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
 {
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-    mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
+    dim3 block(32, 8);
+    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+    mergeC3_<T><<<grid, block, 0, stream>>>(
         src[0].data, src[0].step,
         src[1].data, src[1].step,
         src[2].data, src[2].step,
@@ -261,9 +261,9 @@ namespace cv { namespace gpu { namespace device
 template <typename T>
 static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
 {
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-    mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
+    dim3 block(32, 8);
+    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+    mergeC4_<T><<<grid, block, 0, stream>>>(
         src[0].data, src[0].step,
         src[1].data, src[1].step,
         src[2].data, src[2].step,
@@ -437,9 +437,9 @@ namespace cv { namespace gpu { namespace device
 template <typename T>
 static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
 {
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-    splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
+    dim3 block(32, 8);
+    dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+    splitC2_<T><<<grid, block, 0, stream>>>(
         src.data, src.step, src.rows, src.cols,
         dst[0].data, dst[0].step,
         dst[1].data, dst[1].step);
@@ -453,9 +453,9 @@ namespace cv { namespace gpu { namespace device
 template <typename T>
 static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
 {
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-    splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
+    dim3 block(32, 8);
+    dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+    splitC3_<T><<<grid, block, 0, stream>>>(
         src.data, src.step, src.rows, src.cols,
         dst[0].data, dst[0].step,
         dst[1].data, dst[1].step,
@@ -470,9 +470,9 @@ namespace cv { namespace gpu { namespace device
 template <typename T>
 static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
 {
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-    splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
+    dim3 block(32, 8);
+    dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+    splitC4_<T><<<grid, block, 0, stream>>>(
         src.data, src.step, src.rows, src.cols,
         dst[0].data, dst[0].step,
         dst[1].data, dst[1].step,
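The blockDim/gridDim -> block/grid renames above are presumably shadow-avoidance: in a .cu file, CUDA's headers declare blockDim and gridDim for device code, so host-side locals with those names trip -Wshadow and invite confusion. A sketch of the launch-config idiom with the new names (helper names illustrative):

#include <cuda_runtime.h>

// divUp as used by the launch helpers above.
inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

__global__ void copyKernel(const float* src, float* dst, int cols, int rows)
{
    // blockDim/gridDim here are the CUDA built-ins; naming a host local
    // "blockDim" would shadow the declaration CUDA's headers put in scope.
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < cols && y < rows)
        dst[y * cols + x] = src[y * cols + x];
}

void launchCopy(const float* src, float* dst, int cols, int rows, cudaStream_t stream)
{
    dim3 block(32, 8);                                     // not "blockDim"
    dim3 grid(divUp(cols, block.x), divUp(rows, block.y)); // not "gridDim"
    copyKernel<<<grid, block, 0, stream>>>(src, dst, cols, rows);
}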
@@ -1,7 +1,7 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
@@ -129,9 +129,9 @@ texture<float, 1, cudaReadModeElementType> tex_diffusivity_y;
 __global__ void pointwise_add(float *d_res, const float *d_op1, const float *d_op2, const int len)
 {
     const int pos = blockIdx.x*blockDim.x + threadIdx.x;

     if(pos >= len) return;

     d_res[pos] = d_op1[pos] + d_op2[pos];
 }

@@ -265,7 +265,7 @@ __forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const flo
 ///////////////////////////////////////////////////////////////////////////////
 template<int tex_id>
 __forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p)
 {
     //position within shared memory array
     const int ijs = js * PSOR_PITCH + is;
     //mirror reflection across borders
@@ -299,7 +299,7 @@ __forceinline__ __device__ void load_array_element(float *smem, int is, int js,
 ///\param h number of rows in global memory array
 ///\param p global memory array pitch in floats
 ///////////////////////////////////////////////////////////////////////////////
 template<int tex>
 __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p)
 {
     const int i = threadIdx.x + 2;
@@ -381,7 +381,7 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i
 /// \param gamma (in) gamma in Brox model (edge importance)
 ///////////////////////////////////////////////////////////////////////////////

 __global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y,
                                         float *denominator_u, float *denominator_v,
                                         float *numerator_dudv,
                                         float *numerator_u, float *numerator_v,
@@ -532,16 +532,16 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
 // Red-Black SOR
 /////////////////////////////////////////////////////////////////////////////////////////

 template<int isBlack> __global__ void sor_pass(float *new_du,
                                                float *new_dv,
                                                const float *g_inv_denominator_u,
                                                const float *g_inv_denominator_v,
                                                const float *g_numerator_u,
                                                const float *g_numerator_v,
                                                const float *g_numerator_dudv,
                                                float omega,
                                                int width,
                                                int height,
                                                int stride)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -604,7 +604,7 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
 if((i+j)%2 == isBlack)
 {
     // update du
     float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) -
                          u * (s_left + s_right + s_up + s_down) - g_numerator_u[pos] - numerator_dudv*dv);

     du = (1.0f - omega) * du + omega * g_inv_denominator_u[pos] * numerator_u;
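sor_pass updates only cells whose parity matches isBlack, so each half-sweep reads values written exclusively by the other color and can run fully in parallel. A compact host-side sketch of red-black successive over-relaxation on a 1D Laplace problem:

#include <cstdio>
#include <vector>

// Host sketch of the red-black SOR ordering used by sor_pass above: cells
// are split by parity so that each half-sweep only reads values produced by
// the other color, which makes the update safe to parallelize.
int main()
{
    const int n = 64;
    const float omega = 1.5f;               // sor_pass above uses 1.99f
    std::vector<float> u(n, 0.0f);
    u[0] = u[n - 1] = 1.0f;                 // fixed boundary values

    for (int sweep = 0; sweep < 200; ++sweep)
        for (int color = 0; color < 2; ++color)     // red, then black
            for (int i = 1; i < n - 1; ++i)
                if (i % 2 == color)                 // 1D analogue of (i+j)%2 == isBlack
                {
                    const float gs = 0.5f * (u[i - 1] + u[i + 1]); // Gauss-Seidel value
                    u[i] = (1.0f - omega) * u[i] + omega * gs;     // over-relaxation
                }

    printf("u[n/2] = %f\n", u[n / 2]);      // converges toward 1.0
    return 0;
}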
@@ -644,7 +644,7 @@ void InitTextures()
 initTexture2D(tex_I1);
 initTexture2D(tex_fine);   // for downsampling
 initTexture2D(tex_coarse); // for prolongation

 initTexture2D(tex_Ix);
 initTexture2D(tex_Ixx);
 initTexture2D(tex_Ix0);
@@ -725,7 +725,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 const Ncv32u kSourceHeight = frame0.height();

 ncvAssertPrintReturn(frame1.width() == kSourceWidth && frame1.height() == kSourceHeight, "Frame dims do not match", NCV_INCONSISTENT_INPUT);
 ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth &&
                 uOut.height() == kSourceHeight && vOut.height() == kSourceHeight, NCV_INCONSISTENT_INPUT);

 ncvAssertReturn(gpu_mem_allocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);
@@ -780,7 +780,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 SAFE_VECTOR_DECL(dv_new, gpu_mem_allocator, kSizeInPixelsAligned);

 // temporary storage
 SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator,
     alignUp(kSourceWidth, kStrideAlignmentFloat) * alignUp(kSourceHeight, kStrideAlignmentFloat));

 // image derivatives
@@ -800,7 +800,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 {
     const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f};

     ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize,
         cudaMemcpyHostToDevice), NCV_CUDA_ERROR);

     InitTextures();
@@ -827,10 +827,10 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 size_t src_width_in_bytes = kSourceWidth * sizeof(float);
 size_t src_pitch_in_bytes = frame0.pitch();

 ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(),
     src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );

 ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(),
     src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
 }

@@ -876,11 +876,11 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 NcvRect32u dstROI (0, 0, level_width, level_height);

 // frame 0
 ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI,
     level_frame0->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) );

 // frame 1
 ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI,
     level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) );
 }

@@ -956,14 +956,14 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 dim3 dThreads(32, 6);

 const int kPitchTex = kLevelStride * sizeof(float);

 NcvSize32u srcSize(kLevelWidth, kLevelHeight);
 Ncv32u nSrcStep = kLevelStride * sizeof(float);
 NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight);

 // Ix0
 ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Ix0.ptr(), srcSize, nSrcStep, oROI,
     nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

 // Iy0
 ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Iy0.ptr(), srcSize, nSrcStep, oROI,
@@ -987,8 +987,8 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,

 // Ixy
 ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
     nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

 ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
 ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
 ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
@@ -1017,21 +1017,21 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 {
     //compute coefficients
     prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>>
         (diffusivity_x.ptr(),
         diffusivity_y.ptr(),
         denom_u.ptr(),
         denom_v.ptr(),
         num_dudv.ptr(),
         num_u.ptr(),
         num_v.ptr(),
         kLevelWidth,
         kLevelHeight,
         kLevelStride,
         alpha,
         gamma);

     ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

     ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
     ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

@@ -1043,7 +1043,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);

 ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

 // linear system coefficients
 ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
 ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
@@ -1055,26 +1055,26 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,

 ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
 ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

 //solve linear system
 for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
 {
     float omega = 1.99f;

     ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
     ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

     sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>
         (du_new.ptr(),
         dv_new.ptr(),
         denom_u.ptr(),
         denom_v.ptr(),
         num_u.ptr(),
         num_v.ptr(),
         num_dudv.ptr(),
         omega,
         kLevelWidth,
         kLevelHeight,
         kLevelStride);

     ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
@@ -1083,16 +1083,16 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

 sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>
     (du.ptr(),
     dv.ptr(),
     denom_u.ptr(),
     denom_v.ptr(),
     num_u.ptr(),
     num_v.ptr(),
     num_dudv.ptr(),
     omega,
     kLevelWidth,
     kLevelHeight,
     kLevelStride);

 ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
@@ -1120,19 +1120,19 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,

 dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
 dim3 p_threads(32, 8);

-NcvSize32u srcSize (kLevelWidth, kLevelHeight);
+NcvSize32u inner_srcSize (kLevelWidth, kLevelHeight);
 NcvSize32u dstSize (nw, nh);
 NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);
 NcvRect32u dstROI (0, 0, nw, nh);

-ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
+ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
     ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );

 ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
 ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

-ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
+ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
     ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );

 ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
@@ -1148,11 +1148,11 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
 ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);

 ncvAssertCUDAReturn( cudaMemcpy2DAsync
     (uOut.ptr(), uOut.pitch(), ptrU->ptr(),
     kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );

 ncvAssertCUDAReturn( cudaMemcpy2DAsync
     (vOut.ptr(), vOut.pitch(), ptrV->ptr(),
     kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );

 ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
@@ -252,7 +252,7 @@ NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
 //===================================================================


-NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
+NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment_)
     :
     currentSize(0),
     _maxSize(0),
@@ -260,23 +260,23 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
     begin(NULL),
     end(NULL),
     _memType(NCVMemoryTypeNone),
-    _alignment(alignment),
+    _alignment(alignment_),
     bReusesMemory(false)
 {
-    NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
+    NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
     ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
 }


-NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr)
+NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment_, void *reusePtr)
     :
     currentSize(0),
     _maxSize(0),
     allocBegin(NULL),
     _memType(memT),
-    _alignment(alignment)
+    _alignment(alignment_)
 {
-    NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
+    NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
     ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");
     ncvAssertPrintCheck(memT != NCVMemoryTypeNone, "NCVMemStackAllocator ctor:: Incorrect allocator type");

@@ -425,12 +425,12 @@ size_t NCVMemStackAllocator::maxSize(void) const
 //===================================================================


-NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment)
+NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment_)
     :
     currentSize(0),
     _maxSize(0),
     _memType(memT),
-    _alignment(alignment)
+    _alignment(alignment_)
 {
     ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
 }
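The alignment_ renames above are the same shadowing cleanup; the check itself is the standard power-of-two idiom, which pairs with alignUp-style rounding elsewhere in NCV. A sketch (helper names illustrative):

#include <cassert>
#include <cstdio>

// The power-of-two check used by the allocator ctors above.
static bool isPow2(unsigned a) { return a != 0 && (a & (a - 1)) == 0; }

// Round size up to the next multiple of a power-of-two alignment
// (the usual alignUp idiom; name here is illustrative).
static unsigned alignUpSketch(unsigned size, unsigned alignment)
{
    assert(isPow2(alignment));
    return (size + alignment - 1) & ~(alignment - 1);
}

int main()
{
    printf("%u\n", alignUpSketch(100, 64)); // 128
    printf("%d\n", isPow2(96));             // 0: 96 = 2^5 * 3
    return 0;
}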
@@ -64,7 +64,7 @@
 #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
 #endif

 namespace cv { namespace gpu
 {
     void error(const char *error_string, const char *file, const int line, const char *func);

@@ -87,14 +87,14 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int

 #ifdef __CUDACC__

 namespace cv { namespace gpu
 {
     __host__ __device__ __forceinline__ int divUp(int total, int grain)
     {
         return (total + grain - 1) / grain;
     }

     namespace device
     {
         typedef unsigned char uchar;
         typedef unsigned short ushort;
@@ -45,7 +45,7 @@

 #include "common.hpp"

 namespace cv { namespace gpu { namespace device
 {
 #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200

@@ -54,13 +54,13 @@ namespace cv { namespace gpu { namespace device
 {
     __device__ __forceinline__ static void Load(const T* ptr, int offset, T& val) { val = ptr[offset]; }
 };

-#else // __CUDA_ARCH__ >= 200

-#if defined(_WIN64) || defined(__LP64__)
+#else // __CUDA_ARCH__ >= 200

+#if defined(_WIN64) || defined(__LP64__)
 // 64-bit register modifier for inlined asm
 #define OPENCV_GPU_ASM_PTR "l"
 #else
 // 32-bit register modifier for inlined asm
 #define OPENCV_GPU_ASM_PTR "r"
 #endif
@@ -84,21 +84,21 @@ namespace cv { namespace gpu { namespace device
         asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_GPU_ASM_PTR(ptr + offset)); \
     } \
 };

 OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar, u8)
 OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar, s8)
 OPENCV_GPU_DEFINE_FORCE_GLOB_B(char, b8)
 OPENCV_GPU_DEFINE_FORCE_GLOB (ushort, u16, h)
 OPENCV_GPU_DEFINE_FORCE_GLOB (short, s16, h)
 OPENCV_GPU_DEFINE_FORCE_GLOB (uint, u32, r)
 OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r)
 OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f)
 OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d)

 #undef OPENCV_GPU_DEFINE_FORCE_GLOB
 #undef OPENCV_GPU_DEFINE_FORCE_GLOB_B
 #undef OPENCV_GPU_ASM_PTR

 #endif // __CUDA_ARCH__ >= 200
 }}} // namespace cv { namespace gpu { namespace device

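For orientation, OPENCV_GPU_DEFINE_FORCE_GLOB stamps out ForceGlob<T>::Load specializations that force an explicit ld.global via inline PTX. The following is an approximate expansion for float, inferred from the _B variant visible above, so treat the exact body as an assumption rather than a quote of the real header:

// Approximate expansion of OPENCV_GPU_DEFINE_FORCE_GLOB(float, f32, f),
// inferred from the _B variant shown above -- an assumption, not a quote.
#define OPENCV_GPU_ASM_PTR "l"   // 64-bit pointer constraint; "r" on 32-bit (see above)

template <typename T> struct ForceGlob;

template <> struct ForceGlob<float>
{
    __device__ __forceinline__ static void Load(const float* ptr, int offset, float& val)
    {
        // "=f" matches the f32 PTX type; the pointer constraint comes from
        // OPENCV_GPU_ASM_PTR, selected by the 64-/32-bit branch above.
        asm("ld.global.f32 %0, [%1];" : "=f"(val) : OPENCV_GPU_ASM_PTR(ptr + offset));
    }
};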
@ -44,7 +44,7 @@
|
|||||||
#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__
|
#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__
|
||||||
|
|
||||||
namespace cv { namespace gpu { namespace device
|
namespace cv { namespace gpu { namespace device
|
||||||
{
|
{
|
||||||
template<class T> struct DynamicSharedMem
|
template<class T> struct DynamicSharedMem
|
||||||
{
|
{
|
||||||
__device__ __forceinline__ operator T*()
|
__device__ __forceinline__ operator T*()
|
||||||
@ -45,21 +45,21 @@

#include "warp_reduce.hpp"

namespace cv { namespace gpu { namespace device
{
    struct Emulation
    {
        static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
        {
#if __CUDA_ARCH__ >= 200
            (void)cta_buffer;
            return __ballot(predicate);
#else
            int tid = threadIdx.x;
            cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
            return warp_reduce(cta_buffer);
#endif
        }
    };
}}} // namespace cv { namespace gpu { namespace device
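Emulation::Ballot yields one warp-wide predicate mask per call: on sm_20+ it lowers straight to __ballot, while the pre-Fermi path assembles the mask in a caller-provided shared buffer. A hedged sketch of a typical call site (buffer size and kernel are illustrative; the buffer needs one slot per thread):

    __global__ void countPositive(const int* src, int* warpMasks, int n)
    {
        __shared__ volatile int cta_buffer[256];   // one slot per thread in the block

        int i = blockIdx.x * blockDim.x + threadIdx.x;
        int pred = (i < n && src[i] > 0);

        int mask = cv::gpu::device::Emulation::Ballot(pred, cta_buffer);

        if ((threadIdx.x & 31) == 0)               // lane 0 publishes its warp's mask
            warpMasks[i >> 5] = mask;
    }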
@ -46,14 +46,14 @@

#include <cstdio>

namespace cv { namespace gpu { namespace device
{
    template<class Func>
    void printFuncAttrib(Func& func)
    {
        cudaFuncAttributes attrs;
        cudaFuncGetAttributes(&attrs, func);

        printf("=== Function stats ===\n");
        printf("Name: \n");
@ -65,7 +65,7 @@ namespace cv { namespace gpu { namespace device
        printf("ptxVersion         = %d\n", attrs.ptxVersion);
        printf("binaryVersion      = %d\n", attrs.binaryVersion);
        printf("\n");
        fflush(stdout);
    }
}}} // namespace cv { namespace gpu { namespace device
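printFuncAttrib is a host-side debugging aid: it queries cudaFuncGetAttributes for a kernel and prints the occupancy-relevant fields (the two version fields appear in the hunk above). Typical use, with an illustrative kernel:

    __global__ void myKernel() {}

    void dumpStats()
    {
        cv::gpu::device::printFuncAttrib(myKernel);   // prints register/smem/version stats
    }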
@ -48,7 +48,7 @@
#include "vec_traits.hpp"
#include "type_traits.hpp"

namespace cv { namespace gpu { namespace device
{
    // Function Objects

@ -257,7 +257,7 @@ namespace cv { namespace gpu { namespace device

    template <typename T> struct bit_not : unary_function<T, T>
    {
        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const
        {
            return ~v;
        }
@ -268,7 +268,7 @@ namespace cv { namespace gpu { namespace device
    // Generalized Identity Operations
    template <typename T> struct identity : unary_function<T, T>
    {
        __device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const
        {
            return x;
        }
@ -278,7 +278,7 @@ namespace cv { namespace gpu { namespace device

    template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
    {
        __device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
        {
            return lhs;
        }
@ -288,7 +288,7 @@ namespace cv { namespace gpu { namespace device

    template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
    {
        __device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
        {
            return rhs;
        }
@ -308,7 +308,7 @@ namespace cv { namespace gpu { namespace device

    template <typename T> struct maximum : binary_function<T, T, T>
    {
        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
        {
            return lhs < rhs ? rhs : lhs;
        }
@ -328,7 +328,7 @@ namespace cv { namespace gpu { namespace device

    template <typename T> struct minimum : binary_function<T, T, T>
    {
        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
        {
            return lhs < rhs ? lhs : rhs;
        }
@ -410,12 +410,14 @@ namespace cv { namespace gpu { namespace device
    #undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR
    #undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR

    template<typename T> struct hypot_sqr_func : binary_function<T, T, float>
    {
        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const
        {
            return src1 * src1 + src2 * src2;
        }
        __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func& other) : binary_function<T, T, float>(){}
        __device__ __forceinline__ hypot_sqr_func() : binary_function<T, T, float>(){}
    };

    // Saturate Cast Functor
@ -438,6 +440,7 @@ namespace cv { namespace gpu { namespace device
        {
            return (src > thresh) * maxVal;
        }

        __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)
            : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}

@ -455,6 +458,7 @@ namespace cv { namespace gpu { namespace device
        {
            return (src <= thresh) * maxVal;
        }

        __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)
            : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}

@ -519,12 +523,16 @@ namespace cv { namespace gpu { namespace device
        explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}

        __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const
        {
            return !pred(x);
        }

        __device__ __forceinline__ unary_negate(const unary_negate& other) : unary_function<typename Predicate::argument_type, bool>(){}
        __device__ __forceinline__ unary_negate() : unary_function<typename Predicate::argument_type, bool>(){}

        const Predicate pred;
    };

    template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
    {
        return unary_negate<Predicate>(pred);
@ -534,19 +542,26 @@ namespace cv { namespace gpu { namespace device
    {
        explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}

        __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x,
                                                   typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
        {
            return !pred(x,y);
        }

        __device__ __forceinline__ binary_negate(const binary_negate& other)
            : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}

        __device__ __forceinline__ binary_negate() :
            binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}

        const Predicate pred;
    };

    template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
    {
        return binary_negate<BinaryPredicate>(pred);
    }

    template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
    {
        __host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}

@ -555,15 +570,19 @@ namespace cv { namespace gpu { namespace device
            return op(arg1, a);
        }

        __device__ __forceinline__ binder1st(const binder1st& other) :
            unary_function<typename Op::second_argument_type, typename Op::result_type>(){}

        const Op op;
        const typename Op::first_argument_type arg1;
    };

    template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
    {
        return binder1st<Op>(op, typename Op::first_argument_type(x));
    }

    template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
    {
        __host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}

@ -572,16 +591,19 @@ namespace cv { namespace gpu { namespace device
            return op(a, arg2);
        }

        __device__ __forceinline__ binder2nd(const binder2nd& other) :
            unary_function<typename Op::first_argument_type, typename Op::result_type>(), op(other.op), arg2(other.arg2){}

        const Op op;
        const typename Op::second_argument_type arg2;
    };

    template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
    {
        return binder2nd<Op>(op, typename Op::second_argument_type(x));
    }

    // Functor Traits

    template <typename F> struct IsUnaryFunction
    {
        typedef char Yes;
@ -618,7 +640,7 @@ namespace cv { namespace gpu { namespace device
    {
        enum { shift = UnOpShift<sizeof(T), sizeof(D)>::shift };
    };

    template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };
    template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };
    template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };
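These functors mirror the C++03 <functional> adaptors so device code can compose element-wise operations. A sketch of the common pattern, assuming the plus<T> functor defined elsewhere in this same header: bind2nd fixes the right-hand operand, turning a binary functor into a unary one.

    using namespace cv::gpu::device;

    __global__ void addFiveKernel(const int* src, int* dst, int n)
    {
        binder2nd< plus<int> > addFive = bind2nd(plus<int>(), 5);

        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dst[i] = addFive(src[i]);   // behaves like src[i] + 5
    }

The device-qualified constructors added in this commit fit the same pattern: these functors are copied by value into kernels and the transform framework, and explicit __device__ copy constructors appear intended to keep those copies legal in device code.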
@ -46,7 +46,7 @@
#include <limits>
#include "common.hpp"

namespace cv { namespace gpu { namespace device
{
    template<class T> struct numeric_limits
    {
@ -57,35 +57,35 @@ namespace cv { namespace gpu { namespace device
    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }

    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
    {
        return (uchar) ::max((int)v, 0);
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
    {
        return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
    {
        return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
    {
        return (uchar) ::min(v, (uint)UCHAR_MAX);
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
    {
        return saturate_cast<uchar>((uint)v);
    }

    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
    {
        int iv = __float2int_rn(v);
        return saturate_cast<uchar>(iv);
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
        int iv = __double2int_rn(v);
        return saturate_cast<uchar>(iv);
    #else
        return saturate_cast<uchar>((float)v);
@ -93,35 +93,35 @@ namespace cv { namespace gpu { namespace device
    }

    template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
    {
        return (schar) ::min((int)v, SCHAR_MAX);
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
    {
        return (schar) ::min((uint)v, (uint)SCHAR_MAX);
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
    {
        return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
    {
        return saturate_cast<schar>((int)v);
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
    {
        return (schar) ::min(v, (uint)SCHAR_MAX);
    }

    template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
    {
        int iv = __float2int_rn(v);
        return saturate_cast<schar>(iv);
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
        int iv = __double2int_rn(v);
        return saturate_cast<schar>(iv);
    #else
        return saturate_cast<schar>((float)v);
@ -129,30 +129,30 @@ namespace cv { namespace gpu { namespace device
    }

    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
    {
        return (ushort) ::max((int)v, 0);
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
    {
        return (ushort) ::max((int)v, 0);
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
    {
        return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
    {
        return (ushort) ::min(v, (uint)USHRT_MAX);
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
    {
        int iv = __float2int_rn(v);
        return saturate_cast<ushort>(iv);
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
        int iv = __double2int_rn(v);
        return saturate_cast<ushort>(iv);
    #else
        return saturate_cast<ushort>((float)v);
@ -160,37 +160,37 @@ namespace cv { namespace gpu { namespace device
    }

    template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
    {
        return (short) ::min((int)v, SHRT_MAX);
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(int v)
    {
        return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
    {
        return (short) ::min(v, (uint)SHRT_MAX);
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(float v)
    {
        int iv = __float2int_rn(v);
        return saturate_cast<short>(iv);
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(double v)
    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
        int iv = __double2int_rn(v);
        return saturate_cast<short>(iv);
    #else
        return saturate_cast<short>((float)v);
    #endif
    }

    template<> __device__ __forceinline__ int saturate_cast<int>(float v)
    {
        return __float2int_rn(v);
    }
    template<> __device__ __forceinline__ int saturate_cast<int>(double v)
    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
        return __double2int_rn(v);
@ -200,11 +200,11 @@ namespace cv { namespace gpu { namespace device
    }

    template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
    {
        return __float2uint_rn(v);
    }
    template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)
    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
        return __double2uint_rn(v);
    #else
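Behavioral note: unlike a plain C cast, these specializations clamp to the destination range and round floating-point input to nearest via __float2int_rn / __double2int_rn. A small device-side illustration:

    __global__ void saturateDemo(uchar* out)
    {
        using namespace cv::gpu::device;
        out[0] = saturate_cast<uchar>(300);     // 255: clamped at the top
        out[1] = saturate_cast<uchar>(-5);      // 0: clamped at the bottom
        out[2] = saturate_cast<uchar>(127.6f);  // 128: round to nearest, then clamp
    }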
166    modules/gpu/src/opencv2/gpu/device/scan.hpp    Normal file
@ -0,0 +1,166 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_GPU_SCAN_HPP__
#define __OPENCV_GPU_SCAN_HPP__

enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };

template <ScanKind Kind, typename T, typename F> struct WarpScan
{
    __device__ __forceinline__ WarpScan() {}
    __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }

    __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
    {
        const unsigned int lane = idx & 31;
        F op;

        if ( lane >=  1) ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
        if ( lane >=  2) ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
        if ( lane >=  4) ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
        if ( lane >=  8) ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
        if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);

        if( Kind == INCLUSIVE )
            return ptr [idx];
        else
            return (lane > 0) ? ptr [idx - 1] : 0;
    }

    __device__ __forceinline__ unsigned int index(const unsigned int tid)
    {
        return tid;
    }

    __device__ __forceinline__ void init(volatile T *ptr){}

    static const int warp_offset = 0;

    typedef WarpScan<INCLUSIVE, T, F> merge;
};

template <ScanKind Kind , typename T, typename F> struct WarpScanNoComp
{
    __device__ __forceinline__ WarpScanNoComp() {}
    __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }

    __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
    {
        const unsigned int lane = threadIdx.x & 31;
        F op;

        ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
        ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
        ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
        ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
        ptr [idx ] = op(ptr [idx - 16], ptr [idx]);

        if( Kind == INCLUSIVE )
            return ptr [idx];
        else
            return (lane > 0) ? ptr [idx - 1] : 0;
    }

    __device__ __forceinline__ unsigned int index(const unsigned int tid)
    {
        return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
    }

    __device__ __forceinline__ void init(volatile T *ptr)
    {
        ptr[threadIdx.x] = 0;
    }

    static const int warp_smem_stride = 32 + 16 + 1;
    static const int warp_offset      = 16;
    static const int warp_log         = 5;
    static const int warp_mask        = 31;

    typedef WarpScanNoComp<INCLUSIVE, T, F> merge;
};

template <ScanKind Kind , typename T, typename Sc, typename F> struct BlockScan
{
    __device__ __forceinline__ BlockScan() {}
    __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }

    __device__ __forceinline__ T operator()(volatile T *ptr)
    {
        const unsigned int tid  = threadIdx.x;
        const unsigned int lane = tid & warp_mask;
        const unsigned int warp = tid >> warp_log;

        Sc scan;
        typename Sc::merge merge_scan;
        const unsigned int idx = scan.index(tid);

        T val = scan(ptr, idx);
        __syncthreads ();

        if( warp == 0)
            scan.init(ptr);
        __syncthreads ();

        if( lane == 31 )
            ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
        __syncthreads ();

        if( warp == 0 )
            merge_scan(ptr, idx);
        __syncthreads();

        if ( warp > 0)
            val = ptr [scan.warp_offset + warp - 1] + val;
        __syncthreads ();

        ptr[idx] = val;
        __syncthreads ();

        return val ;
    }

    static const int warp_log  = 5;
    static const int warp_mask = 31;
};

#endif
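The new header composes a block-wide scan out of per-warp scans: each warp scans its segment, lane 31 publishes the warp total, warp 0 scans the totals, and every thread then adds its warp's prefix. A hedged usage sketch for an inclusive sum over a single 256-thread block, using the plain WarpScan variant (index(tid) == tid, no bank-conflict padding) and the plus<int> functor from functional.hpp; include paths are illustrative:

    #include "opencv2/gpu/device/scan.hpp"
    #include "opencv2/gpu/device/functional.hpp"

    typedef WarpScan <INCLUSIVE, int, cv::gpu::device::plus<int> >        WScan;
    typedef BlockScan<INCLUSIVE, int, WScan, cv::gpu::device::plus<int> > BScan;

    __global__ void prefixSum256(const int* src, int* dst)
    {
        __shared__ volatile int smem[256];

        smem[threadIdx.x] = src[threadIdx.x];
        __syncthreads();

        BScan scan;
        int inclusive = scan(smem);    // this thread's running total over src[0..tid]

        dst[threadIdx.x] = inclusive;
    }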
@ -43,27 +43,27 @@
#ifndef __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
#define __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__

#if defined(__CUDACC__)
    #define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#else
    #define __OPENCV_GPU_HOST_DEVICE__
#endif

namespace cv { namespace gpu
{
    namespace device
    {
        template<bool expr> struct Static {};

        template<> struct Static<true>
        {
            __OPENCV_GPU_HOST_DEVICE__ static void check() {};
        };
    }

    using ::cv::gpu::device::Static;
}}

#undef __OPENCV_GPU_HOST_DEVICE__

#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
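Static<true>::check() compiles to nothing, while Static<false> has no check() member, so a false condition fails at compile time; this is the pre-C++11 stand-in for static_assert. Usage sketch (function name is illustrative):

    template <int N>
    void requirePowerOfTwo()
    {
        cv::gpu::Static<(N & (N - 1)) == 0>::check();   // compile error unless N is a power of two
    }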
@ -47,7 +47,7 @@
#include "utility.hpp"
#include "detail/transform_detail.hpp"

namespace cv { namespace gpu { namespace device
{
    template <typename T, typename D, typename UnOp, typename Mask>
    static inline void transform(DevMem2D_<T> src, DevMem2D_<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)
@ -45,11 +45,11 @@

#include "detail/type_traits_detail.hpp"

namespace cv { namespace gpu { namespace device
{
    template <typename T> struct IsSimpleParameter
    {
        enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||
            type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<T>::type>::value};
    };

@ -65,16 +65,16 @@ namespace cv { namespace gpu { namespace device
        enum { isVolatile = type_traits_detail::UnVolatile<T>::value };

        enum { isReference = type_traits_detail::ReferenceTraits<UnqualifiedType>::value };
        enum { isPointer = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value };

        enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
        enum { isSignedInt = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
        enum { isIntegral = type_traits_detail::IsIntegral<UnqualifiedType>::value };
        enum { isFloat = type_traits_detail::IsFloat<UnqualifiedType>::value };
        enum { isArith = isIntegral || isFloat };
        enum { isVec = type_traits_detail::IsVec<UnqualifiedType>::value };

        typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
            T, typename type_traits_detail::AddParameterType<T>::type>::type ParameterType;
    };
}}}
@ -47,17 +47,17 @@
#include "datamov_utils.hpp"
#include "detail/utility_detail.hpp"

namespace cv { namespace gpu { namespace device
{
    #define OPENCV_GPU_LOG_WARP_SIZE (5)
    #define OPENCV_GPU_WARP_SIZE     (1 << OPENCV_GPU_LOG_WARP_SIZE)
    #define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
    #define OPENCV_GPU_MEM_BANKS     (1 << OPENCV_GPU_LOG_MEM_BANKS)

    ///////////////////////////////////////////////////////////////////////////////
    // swap

    template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
    {
        const T temp = a;
        a = b;
@ -71,9 +71,9 @@ namespace cv { namespace gpu { namespace device
    {
        explicit __host__ __device__ __forceinline__ SingleMask(PtrStepb mask_) : mask(mask_) {}
        __host__ __device__ __forceinline__ SingleMask(const SingleMask& mask_): mask(mask_.mask){}

        __device__ __forceinline__ bool operator()(int y, int x) const
        {
            return mask.ptr(y)[x] != 0;
        }

@ -82,13 +82,13 @@ namespace cv { namespace gpu { namespace device

    struct SingleMaskChannels
    {
        __host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_)
            : mask(mask_), channels(channels_) {}
        __host__ __device__ __forceinline__ SingleMaskChannels(const SingleMaskChannels& mask_)
            :mask(mask_.mask), channels(mask_.channels){}

        __device__ __forceinline__ bool operator()(int y, int x) const
        {
            return mask.ptr(y)[x / channels] != 0;
        }

@ -112,7 +112,7 @@ namespace cv { namespace gpu { namespace device
        {
            curMask = maskCollection[z];
        }

        __device__ __forceinline__ bool operator()(int y, int x) const
        {
            uchar val;
@ -165,20 +165,20 @@ namespace cv { namespace gpu { namespace device
        utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
    }

    template <int n, typename T, typename V, typename Pred>
    __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
    {
        StaticAssert<n >= 8 && n <= 512>::check();
        utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
    }

    template <int n, typename T, typename V1, typename V2, typename Pred>
    __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
    {
        StaticAssert<n >= 8 && n <= 512>::check();
        utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Solve linear system
@ -212,17 +212,17 @@ namespace cv { namespace gpu { namespace device
    {
        double invdet = 1.0 / det;

        x[0] = saturate_cast<T>(invdet *
            (b[0]    * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
             A[0][1] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) +
             A[0][2] * (b[1]    * A[2][1] - A[1][1] * b[2]   )));

        x[1] = saturate_cast<T>(invdet *
            (A[0][0] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) -
             b[0]    * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
             A[0][2] * (A[1][0] * b[2]    - b[1]    * A[2][0])));

        x[2] = saturate_cast<T>(invdet *
            (A[0][0] * (A[1][1] * b[2]    - b[1]    * A[2][1]) -
             A[0][1] * (A[1][0] * b[2]    - b[1]    * A[2][0]) +
             b[0]    * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));
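For reference, the three expressions above are Cramer's rule applied to the 3x3 system Ax = b:

    x_i = \frac{\det(A_i)}{\det(A)}, \qquad i = 0, 1, 2,

where A_i denotes A with its i-th column replaced by b, each determinant is expanded by cofactors along that column, and invdet caches the shared 1/\det(A) factor so every component costs one extra multiply.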
@ -47,7 +47,7 @@
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"

namespace cv { namespace gpu { namespace device
{
    template <typename T> struct L1Dist
    {
@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
    };

    // calc distance between two vectors in global memory
    template <int THREAD_DIM, typename Dist, typename T1, typename T2>
    __device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
    {
        for (int i = tid; i < len; i += THREAD_DIM)
@ -170,9 +170,9 @@ namespace cv { namespace gpu { namespace device
    // calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
    __device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
    {
        vec_distance_detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);

        dist.reduceAll<THREAD_DIM>(smem, tid);
    }
@ -47,7 +47,7 @@
#include "vec_traits.hpp"
#include "functional.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace vec_math_detail
    {
@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
    }

    namespace vec_math_detail
    {
        template <typename T1, typename T2> struct BinOpTraits
        {
            typedef int argument_type;
@ -326,5 +326,5 @@ namespace cv { namespace gpu { namespace device
    #undef OPENCV_GPU_IMPLEMENT_VEC_OP
    #undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
}}} // namespace cv { namespace gpu { namespace device

#endif // __OPENCV_GPU_VECMATH_HPP__
@ -45,7 +45,7 @@

#include "common.hpp"

namespace cv { namespace gpu { namespace device
{
    template<typename T, int N> struct TypeVec;

@ -219,18 +219,18 @@ namespace cv { namespace gpu { namespace device

    #undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS

    template<> struct VecTraits<char>
    {
        typedef char elem_type;
        enum {cn=1};
        static __device__ __host__ __forceinline__ char all(char v) {return v;}
        static __device__ __host__ __forceinline__ char make(char x) {return x;}
        static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}
    };
    template<> struct VecTraits<schar>
    {
        typedef schar elem_type;
        enum {cn=1};
        static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
        static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
        static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}
@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_DEVICE_WARP_HPP__
#define __OPENCV_GPU_DEVICE_WARP_HPP__

namespace cv { namespace gpu { namespace device
{
    struct Warp
    {
@ -64,18 +64,18 @@ namespace cv { namespace gpu { namespace device

        template<typename It, typename T>
        static __device__ __forceinline__ void fill(It beg, It end, const T& value)
        {
            for(It t = beg + laneId(); t < end; t += STRIDE)
                *t = value;
        }

        template<typename InIt, typename OutIt>
        static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
        {
            for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
                *out = *t;
            return out;
        }

        template<typename InIt, typename OutIt, class UnOp>
        static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
@ -90,7 +90,7 @@ namespace cv { namespace gpu { namespace device
        {
            unsigned int lane = laneId();

            InIt1 t1 = beg1 + lane;
            InIt2 t2 = beg2 + lane;
            for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)
                *out = op(*t1, *t2);
@ -100,7 +100,7 @@ namespace cv { namespace gpu { namespace device
        template<typename OutIt, typename T>
        static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
        {
            unsigned int lane = laneId();
            value += lane;

            for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
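These helpers are warp-cooperative versions of the STL algorithms: every lane starts at laneId() and strides by the warp size, so one warp can initialize or copy a buffer without divergence. A usage sketch (buffer size and kernel are illustrative):

    __global__ void initHistogram(int* globalHist)
    {
        __shared__ int hist[128];

        cv::gpu::device::Warp::fill(hist, hist + 128, 0);  // lanes 0..31 zero the buffer together
        __syncthreads();
        // ... accumulate into hist, then publish with
        //     cv::gpu::device::Warp::copy(hist, hist + 128, globalHist);
    }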
@ -80,51 +80,9 @@ namespace cv { namespace gpu { namespace device
        template <typename T>
        void resize_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy,
                        DevMem2Db dst, int interpolation, cudaStream_t stream);

        template <typename T>
        void resize_area_gpu(const DevMem2Db src, DevMem2Db dst,float fx, float fy,
                             int interpolation, DevMem2Df buffer, cudaStream_t stream);
    }
}}}

void cv::gpu::resize(const GpuMat& src, GpuMat& dst, GpuMat& buffer, Size dsize, double fx, double fy,
                     int interpolation, Stream& s)
{
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
    CV_Assert(interpolation == INTER_AREA);
    CV_Assert( (fx < 1.0) && (fy < 1.0));
    CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));
    CV_Assert(src.cols >= 128 && src.rows >= 128);
    CV_Assert((fx - 128.0) <= 0 && (fy - 128.0) <= 0);

    if (dsize == Size())
        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
    else
    {
        fx = static_cast<double>(dsize.width) / src.cols;
        fy = static_cast<double>(dsize.height) / src.rows;
    }

    fx = static_cast<float>(1.0 / fx);
    fy = static_cast<float>(1.0 / fy);

    dst.create(dsize, src.type());
    buffer.create(cv::Size(dsize.width, src.rows), CV_32FC1);

    if (dsize == src.size())
    {
        if (s)
            s.enqueueCopy(src, dst);
        else
            src.copyTo(dst);
        return;
    }

    cudaStream_t stream = StreamAccessor::getStream(s);

    cv::gpu::device::imgproc::resize_area_gpu<uchar>(src, dst, fx, fy, interpolation, buffer, stream);
}

void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
@ -182,45 +182,6 @@ PARAM_TEST_CASE(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, double, Interpolation, UseRoi)
    }
};

TEST_P(ResizeArea, Accuracy)
{
    cv::Mat src = randomMat(size, type);

    cv::gpu::GpuMat dst = createMat(cv::Size(cv::saturate_cast<int>(src.cols * coeff), cv::saturate_cast<int>(src.rows * coeff)), type, useRoi);
    cv::gpu::GpuMat buffer = createMat(cv::Size(dst.cols, src.rows), CV_32FC1);

    cv::gpu::resize(loadMat(src, useRoi), dst, buffer, cv::Size(), coeff, coeff, interpolation);

    cv::Mat dst_cpu;

    cv::resize(src, dst_cpu, cv::Size(), coeff, coeff, interpolation);

    cv::Mat gpu_buff;
    buffer.download(gpu_buff);

    cv::Mat gpu;
    dst.download(gpu);

//    std::cout // << src
//              // << std::endl << std::endl
//              // << gpu_buff
//              // << std::endl << std::endl
//              << gpu
//              << std::endl << std::endl
//              << dst_cpu << std::endl;

    EXPECT_MAT_NEAR(dst_cpu, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
}

INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeArea, testing::Combine(
    ALL_DEVICES,
    testing::Values(cv::Size(640, 480)), //DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1)/*MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
    testing::Values(0.05, 0.1),
    testing::Values(Interpolation(cv::INTER_AREA)),
    WHOLE_SUBMAT));

///////////////////////////////////////////////////////////////////
// Test NPP