Scan operations are moved into a separate header
This commit is contained in:
@@ -1253,7 +1253,7 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
const T val;
|
||||
|
||||
__host__ explicit CompareScalar(T val) : val(val) {}
|
||||
__host__ explicit CompareScalar(T val_) : val(val_) {}
|
||||
|
||||
__device__ __forceinline__ uchar operator()(T src) const
|
||||
{
|
||||
@@ -1266,7 +1266,7 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
const TYPE_VEC(T, 2) val;
|
||||
|
||||
__host__ explicit CompareScalar(TYPE_VEC(T, 2) val) : val(val) {}
|
||||
__host__ explicit CompareScalar(TYPE_VEC(T, 2) val_) : val(val_) {}
|
||||
|
||||
__device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const
|
||||
{
|
||||
@@ -1281,7 +1281,7 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
const TYPE_VEC(T, 3) val;
|
||||
|
||||
__host__ explicit CompareScalar(TYPE_VEC(T, 3) val) : val(val) {}
|
||||
__host__ explicit CompareScalar(TYPE_VEC(T, 3) val_) : val(val_) {}
|
||||
|
||||
__device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const
|
||||
{
|
||||
@@ -1297,7 +1297,7 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
const TYPE_VEC(T, 4) val;
|
||||
|
||||
__host__ explicit CompareScalar(TYPE_VEC(T, 4) val) : val(val) {}
|
||||
__host__ explicit CompareScalar(TYPE_VEC(T, 4) val_) : val(val_) {}
|
||||
|
||||
__device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const
|
||||
{
|
||||
|
||||
@@ -72,7 +72,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
struct Mask8U
|
||||
{
|
||||
explicit Mask8U(PtrStepb mask): mask(mask) {}
|
||||
explicit Mask8U(PtrStepb mask_): mask(mask_) {}
|
||||
|
||||
__device__ __forceinline__ bool operator()(int y, int x) const
|
||||
{
|
||||
|
||||
@@ -46,7 +46,8 @@
|
||||
#include "opencv2/gpu/device/vec_math.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/filters.hpp"
|
||||
# include <cfloat>
|
||||
#include <cfloat>
|
||||
#include <opencv2/gpu/device/scan.hpp>
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
@@ -285,367 +286,5 @@ namespace cv { namespace gpu { namespace device
|
||||
typedef float scan_line_type;
|
||||
};
|
||||
|
||||
// template <typename T>
|
||||
// __global__ void resize_area_scan(const DevMem2D_<T> src, DevMem2D_<T> dst, int fx, int fy, DevMem2D_<T> buffer)
|
||||
// {
|
||||
// typedef typename scan_traits<T>::scan_line_type W;
|
||||
// extern __shared__ W line[];
|
||||
|
||||
// const int x = threadIdx.x;
|
||||
// const int y = blockIdx.x;
|
||||
|
||||
// if (y >= src.rows) return;
|
||||
|
||||
// int offset = 1;
|
||||
|
||||
// line[2 * x + 0] = src(y, 2 * x + 0);
|
||||
// line[2 * x + 1] = src(y, 2 * x + 1);
|
||||
|
||||
// __syncthreads();//???
|
||||
// // reduction
|
||||
// for (int d = blockDim.x; d > 0; d >>= 1)
|
||||
// {
|
||||
// __syncthreads();
|
||||
// if (x < d)
|
||||
// {
|
||||
// int ai = 2 * x * offset -1 + 1 * offset;
|
||||
// int bi = 2 * x * offset -1 + 2 * offset;
|
||||
// line[bi] += line[ai];
|
||||
// }
|
||||
|
||||
// offset *= 2;
|
||||
// }
|
||||
|
||||
// __syncthreads();
|
||||
// // convolution
|
||||
// if (x == 0) { line[(blockDim.x << 1) - 1] = 0; printf("offset: %d!!!!!!!!!!!!!\n", fx);}
|
||||
|
||||
// for (int d = 1; d < (blockDim.x << 1); d *= 2)
|
||||
// {
|
||||
// offset >>= 1;
|
||||
|
||||
// __syncthreads();
|
||||
// if (x < d)
|
||||
// {
|
||||
// int ai = offset * 2 * x + 1 * offset - 1;
|
||||
// int bi = offset * 2 * x + 2 * offset - 1;
|
||||
|
||||
// W t = line[ai];
|
||||
// line[ai] = line[bi];
|
||||
// line[bi] += t;
|
||||
// }
|
||||
// }
|
||||
// __syncthreads();
|
||||
|
||||
// // calculate sum
|
||||
// int start = 0;
|
||||
// int out_idx = 0;
|
||||
// int end = start + fx;
|
||||
// while (start < (blockDim.x << 1) && end < (blockDim.x << 1))
|
||||
// {
|
||||
// buffer(y, out_idx) = saturate_cast<T>((line[end] - line[start]) / fx);
|
||||
// start = end;
|
||||
// end = start + fx;
|
||||
// out_idx++;
|
||||
// }
|
||||
|
||||
// }
|
||||
|
||||
// Column-wise pass: scans one column of `buffer` in shared memory and writes
// scaled sums of fy consecutive rows to `dst`.
//
//   buffer  intermediate image read along column blockIdx.x
//   fx, fy  integer scale factors; the output is multiplied by 1 / (fx * fy)
//   dst     destination image, written at row (g_base + threadIdx.x)
//   line    scratch array; indices 0 .. 2 * blockDim.x - 1 are accessed
//   g_base  output-row offset of this chunk (input rows start at g_base * fy)
//
// Each thread owns two consecutive entries of `line`. The index arithmetic of the
// two sweeps only lines up when blockDim.x is a power of two.
template <typename T>
__device__ void scan_y(DevMem2D_<typename scan_traits<T>::scan_line_type> buffer,int fx, int fy, DevMem2D_<T> dst,
                       typename scan_traits<T>::scan_line_type* line, int g_base)
{
    typedef typename scan_traits<T>::scan_line_type W;

    const int row = threadIdx.x;
    const int col = blockIdx.x;

    const float scale = 1.f / (fx * fy);

    // col depends only on blockIdx, so the whole block leaves together here
    // and the barriers below are never reached by a partial block.
    if (col >= buffer.cols) return;

    const int first_row = g_base * fy;
    const int n2 = blockDim.x << 1;

    // Loads are shifted by one row; the last thread pads its second slot with zero.
    line[2 * row + 0] = buffer(first_row + 2 * row + 1, col);
    line[2 * row + 1] = (row != (blockDim.x - 1)) ? buffer(first_row + 2 * row + 2, col) : (W)0;

    __syncthreads();

    // reduction (up-sweep): partial sums accumulate towards the end of `line`
    int offset = 1;
    for (int active = blockDim.x; active > 0; active >>= 1, offset <<= 1)
    {
        __syncthreads();
        if (row < active)
        {
            const int hi = (2 * row + 2) * offset - 1;
            const int lo = hi - offset;
            line[hi] += line[lo];
        }
    }

    __syncthreads();

    // down-sweep, seeded with the first row of the column instead of zero
    if (row == 0) line[n2 - 1] = (W)buffer(0, col);

    for (int active = 1; active < n2; active <<= 1)
    {
        offset >>= 1;

        __syncthreads();
        if (row < active)
        {
            const int lo = offset * (2 * row + 1) - 1;
            const int hi = lo + offset;

            const W carried = line[lo];
            line[lo] = line[hi];
            line[hi] += carried;
        }
    }
    __syncthreads();

    // Difference of two scan entries fy apart gives the sum over one output window.
    if (row < dst.rows)
    {
        const int window_last = row * fy + fy - 1;

        const W start = (row == 0) ? (W)0 : line[row * fy - 1];
        const W end   = line[window_last];

        dst(g_base + row, col) = saturate_cast<T>((end - start) * scale);
    }
}
|
||||
|
||||
// Row-wise pass: scans one row of `src` in shared memory and writes the
// unscaled sums of fx consecutive columns to `buffer`.
//
//   src     source image read along row blockIdx.x
//   fx      horizontal integer scale factor (window width)
//   fy      vertical scale factor; not needed by this pass, kept for signature
//           compatibility (normalisation happens in the column-wise pass)
//   buffer  intermediate image, written at column (g_base + threadIdx.x)
//   line    scratch array; indices 0 .. 2 * blockDim.x - 1 are accessed
//   g_base  output-column offset of this chunk (input columns start at g_base * fx)
//
// Each thread owns two consecutive entries of `line`. The index arithmetic of the
// two sweeps only lines up when blockDim.x is a power of two.
template <typename T>
__device__ void scan_x(const DevMem2D_<T> src, int fx, int fy, DevMem2D_<typename scan_traits<T>::scan_line_type> buffer,
                       typename scan_traits<T>::scan_line_type* line, int g_base)
{
    typedef typename scan_traits<T>::scan_line_type W;

    (void)fy; // the previously computed 1 / (fx * fy) factor was never used here

    const int x = threadIdx.x;
    const int y = blockIdx.x;

    // y depends only on blockIdx, so the whole block exits together and no
    // barrier below is reached by a partial block.
    if (y >= src.rows) return;

    int offset = 1;

    // Loads are shifted by one column; the last thread pads its second slot with zero.
    line[2 * x + 0] = (W)src(y, (g_base * fx) + 2 * x + 1);

    if (x != (blockDim.x - 1))
        line[2 * x + 1] = (W)src(y, (g_base * fx) + 2 * x + 2);
    else
        line[2 * x + 1] = 0;

    __syncthreads();

    // reduction (up-sweep)
    for (int d = blockDim.x; d > 0; d >>= 1)
    {
        __syncthreads();
        if (x < d)
        {
            int ai = 2 * x * offset - 1 + 1 * offset;
            int bi = 2 * x * offset - 1 + 2 * offset;
            line[bi] += line[ai];
        }

        offset *= 2;
    }

    __syncthreads();

    // down-sweep, seeded with the first pixel of the row instead of zero
    if (x == 0) line[(blockDim.x << 1) - 1] = (W)src(y, 0);

    for (int d = 1; d < (blockDim.x << 1); d *= 2)
    {
        offset >>= 1;

        __syncthreads();
        if (x < d)
        {
            int ai = offset * 2 * x + 1 * offset - 1;
            int bi = offset * 2 * x + 2 * offset - 1;

            W t = line[ai];
            line[ai] = line[bi];
            line[bi] += t;
        }
    }
    __syncthreads();

    // Difference of two scan entries fx apart gives the sum over one output window.
    if (x < buffer.cols)
    {
        W start = (x == 0) ? (W)0 : line[x * fx - 1];
        W end = line[x * fx + fx - 1];
        buffer(y, g_base + x) = (end - start);
    }
}
|
||||
|
||||
// Selects the value a scan returns for an element: `inclusive` returns the running
// sum up to and including the element, `exclusive` the running sum of the elements before it.
enum ScanKind { exclusive, inclusive } ;
|
||||
|
||||
// In-place prefix sum over the 32-element segment of `ptr` that contains `idx`.
//
//   ptr  array being scanned (volatile so every step re-reads memory)
//   idx  element handled by the calling thread; defaults to threadIdx.x
//
// Returns the inclusive sum at `idx` when Kind == inclusive; otherwise the value
// stored at idx - 1 after the scan, or 0 for the first lane of the segment.
//
// NOTE(review): there is no barrier between the steps, so this appears to rely on
// all lanes of a warp executing in lockstep - confirm for the target architectures.
template <ScanKind Kind , class T>
__device__ __forceinline__ T scan_warp ( volatile T *ptr , const unsigned int idx = threadIdx.x )
{
    const unsigned int lane = idx & 31;

    // Doubling steps 1, 2, 4, 8, 16: each lane adds the partial sum `step` slots to its left.
    #pragma unroll
    for (unsigned int step = 1; step < 32; step <<= 1)
    {
        if (lane >= step)
            ptr[idx] = ptr[idx - step] + ptr[idx];
    }

    if (Kind == inclusive)
        return ptr[idx];

    return (lane > 0) ? ptr[idx - 1] : 0;
}
|
||||
|
||||
// Block-wide prefix sum over ptr[0 .. blockDim.x - 1], built from per-warp scans.
//
// Every thread of the block must call this (it contains block barriers).
// On return ptr[threadIdx.x] holds the scanned value, which is also returned.
template <ScanKind Kind , class T>
__device__ __forceinline__ T scan_block( volatile T *ptr)
{
    const unsigned int tid          = threadIdx.x;
    const unsigned int warp_id      = tid >> 5;
    const bool         is_last_lane = (tid & 31) == 31;

    // 1. Scan each 32-element segment independently; keep the result in a register
    //    because the front of `ptr` is reused below.
    T result = scan_warp<Kind>(ptr, tid);
    __syncthreads();

    // 2. The last lane of every warp publishes its segment total at ptr[warp_id].
    if (is_last_lane)
        ptr[warp_id] = ptr[tid];
    __syncthreads();

    // 3. The first warp scans those totals in place.
    if (warp_id == 0)
        scan_warp<inclusive>(ptr, tid);
    __syncthreads();

    // 4. Every later warp adds the combined total of the warps before it.
    if (warp_id > 0)
        result = ptr[warp_id - 1] + result;
    __syncthreads();

    // 5. Write the final values back so callers can read them from `ptr`.
    ptr[tid] = result;
    __syncthreads();

    return result;
}
|
||||
|
||||
// Horizontal pass of the area resize: sums every fx consecutive source pixels of a
// row and stores the unscaled sums in `dst`.
//
// Launch layout (as used by resize_area_gpu):
//   grid.x  = rows * thred_lines; block b handles row (b / thred_lines) and the
//             (b % thred_lines)-th chunk of `stride` input columns
//   block.x = number of elements scanned per block
//   dynamic shared memory: blockDim.x * sizeof(W)
//
//   src          source image
//   dst          intermediate image receiving one sum per fx input columns
//   fx           horizontal integer scale factor (window width)
//   fy           unused by this pass (kept for a signature parallel to the y pass)
//   thred_lines  number of blocks per row
//   stride       input columns consumed per block
template<typename T, typename W>
__global__ void resise_scan_fast_x(const DevMem2D_<T> src, DevMem2D_<W> dst, int fx, int fy, int thred_lines, int stride)
{
    extern __shared__ W sbuf[];

    const unsigned int tid = threadIdx.x;

    // locate the chunk of the row handled by this block
    int y = blockIdx.x / thred_lines;
    int input_stride = (blockIdx.x % thred_lines) * stride;
    int x = input_stride + tid;

    // stage the chunk in shared memory, zero-padding past the image border
    if (x < src.cols && y < src.rows)
        sbuf[tid] = src(y, x);
    else
        sbuf[tid] = 0;
    __syncthreads();

    scan_block<inclusive, W>(sbuf);

    int out_stride = input_stride / fx;
    int count = blockDim.x / fx;   // complete windows available in this block
    int out_x = out_stride + tid;

    // The tail block of a row can hold more windows than `dst` has columns left,
    // so the write is bounds-checked.
    if (tid < count && y < dst.rows && out_x < dst.cols)
    {
        // window sum = inclusive scan at the window end minus the scan just before its start
        W start = (tid == 0) ? (W)0 : sbuf[tid * fx - 1];
        W end = sbuf[tid * fx + fx - 1];

        dst(y, out_x) = (end - start);
    }
}
|
||||
|
||||
// Vertical pass of the area resize: sums every fy consecutive rows of the
// intermediate image, scales by 1 / (fx * fy) and stores the saturated result.
//
// Launch layout (as used by resize_area_gpu):
//   grid.x  = columns * thred_lines; block b handles column (b / thred_lines) and the
//             (b % thred_lines)-th chunk of `stride` input rows
//   block.x = number of elements scanned per block
//   dynamic shared memory: blockDim.x * sizeof(W)
//
//   src          intermediate image holding horizontal window sums
//   dst          destination image
//   fx, fy       integer scale factors; fy is the vertical window height
//   thred_lines  number of blocks per column
//   stride       input rows consumed per block
template<typename T, typename W>
__global__ void resise_scan_fast_y(const DevMem2D_<W> src, DevMem2D_<T> dst, int fx, int fy, int thred_lines, int stride)
{
    extern __shared__ W sbuf[];

    const unsigned int tid = threadIdx.x;

    // locate the chunk of the column handled by this block
    int x = blockIdx.x / thred_lines;

    int global_stride = (blockIdx.x % thred_lines) * stride;
    int y = global_stride + tid;

    // stage the chunk in shared memory, zero-padding past the image border
    if (x < src.cols && y < src.rows)
        sbuf[tid] = src(y, x);
    else
        sbuf[tid] = 0;

    __syncthreads();
    scan_block<inclusive, W>(sbuf);

    float scale = __fdividef(1.f, fx * fy);

    // This pass walks rows, so window size, window count and output offset all
    // derive from fy (using fx here is only correct when fx == fy).
    int out_stride = global_stride / fy;
    int count = blockDim.x / fy;
    int out_y = out_stride + tid;

    // The tail block of a column can hold more windows than `dst` has rows left,
    // so the write is bounds-checked.
    if (tid < count && out_y < dst.rows && x < dst.cols)
    {
        // window sum = inclusive scan at the window end minus the scan just before its start
        W start = (tid == 0) ? (W)0 : sbuf[tid * fy - 1];
        W end = sbuf[tid * fy + fy - 1];

        dst(out_y, x) = saturate_cast<T>((end - start) * scale);
    }
}
|
||||
|
||||
// Host entry point for the integer-factor area resize.
//
// Runs two kernels on `stream`: a horizontal pass (src -> buffer) that sums groups
// of round(fx) columns, then a vertical pass (buffer -> dst) that sums groups of
// round(fy) rows and normalises the result.
//
//   src, dst       source and destination images
//   fx, fy         scale factors; rounded to the nearest integer before use
//   interpolation  ignored
//   buffer         float scratch image for the horizontal sums
//   stream         CUDA stream for both launches; when it is the default stream (0)
//                  the call blocks until the device has finished
//
// NOTE(review): a rounded factor of 0, or one larger than the block size (128),
// makes the divisions below divide by zero - callers presumably guarantee
// 1 <= factor <= 128; confirm.
template <typename T>
void resize_area_gpu(const DevMem2Db src, DevMem2Db dst,float fx, float fy,
                     int interpolation, DevMem2Df buffer, cudaStream_t stream)
{
    (void)interpolation;

    int iscale_x = round(fx);
    int iscale_y = round(fy);

    int warps = 4;
    const int threads = 32 * warps;

    typedef typename scan_traits<T>::scan_line_type smem_type;

    // one shared-memory slot per thread for the block scan
    const size_t smem_size = threads * sizeof(smem_type);

    // horizontal pass: each block consumes a whole number of fx-wide windows of one row
    int input_stride = threads / iscale_x;
    int thred_lines = divUp(src.cols, input_stride * iscale_x);
    int blocks = src.rows * thred_lines;

    resise_scan_fast_x<T, smem_type><<<blocks, threads, smem_size, stream>>>
        (src, buffer, iscale_x, iscale_y, thred_lines, input_stride * iscale_x);
    cudaSafeCall( cudaGetLastError() );

    // vertical pass: each block consumes a whole number of fy-high windows of one column
    input_stride = threads / iscale_y;
    thred_lines = divUp(src.rows, input_stride * iscale_y);
    blocks = dst.cols * thred_lines;

    resise_scan_fast_y<T, smem_type><<<blocks, threads, smem_size, stream>>>
        (buffer, dst, iscale_x, iscale_y, thred_lines, input_stride * iscale_y);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

template void resize_area_gpu<uchar>(DevMem2Db src, DevMem2Db dst, float fx, float fy, int interpolation, DevMem2Df buffer, cudaStream_t stream);
|
||||
|
||||
} // namespace imgproc
|
||||
}}} // namespace cv { namespace gpu { namespace device
|
||||
|
||||
@@ -228,9 +228,9 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename T>
|
||||
static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
|
||||
mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
mergeC2_<T><<<grid, block, 0, stream>>>(
|
||||
src[0].data, src[0].step,
|
||||
src[1].data, src[1].step,
|
||||
dst.rows, dst.cols, dst.data, dst.step);
|
||||
@@ -244,9 +244,9 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename T>
|
||||
static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
|
||||
mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
mergeC3_<T><<<grid, block, 0, stream>>>(
|
||||
src[0].data, src[0].step,
|
||||
src[1].data, src[1].step,
|
||||
src[2].data, src[2].step,
|
||||
@@ -261,9 +261,9 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename T>
|
||||
static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
|
||||
mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
|
||||
mergeC4_<T><<<grid, block, 0, stream>>>(
|
||||
src[0].data, src[0].step,
|
||||
src[1].data, src[1].step,
|
||||
src[2].data, src[2].step,
|
||||
@@ -437,9 +437,9 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename T>
|
||||
static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
|
||||
splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
|
||||
splitC2_<T><<<grid, block, 0, stream>>>(
|
||||
src.data, src.step, src.rows, src.cols,
|
||||
dst[0].data, dst[0].step,
|
||||
dst[1].data, dst[1].step);
|
||||
@@ -453,9 +453,9 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename T>
|
||||
static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
|
||||
splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
|
||||
splitC3_<T><<<grid, block, 0, stream>>>(
|
||||
src.data, src.step, src.rows, src.cols,
|
||||
dst[0].data, dst[0].step,
|
||||
dst[1].data, dst[1].step,
|
||||
@@ -470,9 +470,9 @@ namespace cv { namespace gpu { namespace device
|
||||
template <typename T>
|
||||
static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
|
||||
splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
dim3 block(32, 8);
|
||||
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
|
||||
splitC4_<T><<<grid, block, 0, stream>>>(
|
||||
src.data, src.step, src.rows, src.cols,
|
||||
dst[0].data, dst[0].step,
|
||||
dst[1].data, dst[1].step,
|
||||
|
||||
Reference in New Issue
Block a user