GPU: updated upsample, downsample functions, added pyrDown, pyrUp, added support of 16S filtering; put spherical warper on GPU (from opencv_stitching)
This commit is contained in:
@@ -647,4 +647,26 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
template void threshold_gpu<int>(const DevMem2D& src, const DevMem2D& dst, int thresh, int maxVal, int type, cudaStream_t stream);
|
||||
template void threshold_gpu<float>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
|
||||
template void threshold_gpu<double>(const DevMem2D& src, const DevMem2D& dst, double thresh, double maxVal, int type, cudaStream_t stream);
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// subtract
|
||||
|
||||
template <typename T>
|
||||
class SubtractOp
|
||||
{
|
||||
public:
|
||||
__device__ __forceinline__ T operator()(const T& l, const T& r) const
|
||||
{
|
||||
return l - r;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
|
||||
{
|
||||
transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, SubtractOp<T>(), stream);
|
||||
}
|
||||
|
||||
template void subtractCaller<short>(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
@@ -224,6 +224,7 @@ namespace cv { namespace gpu { namespace filters
|
||||
template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<short3, float3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<int , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
}}}
|
||||
@@ -275,7 +276,7 @@ namespace cv { namespace gpu { namespace filters
|
||||
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
|
||||
dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
|
||||
|
||||
B<T> b(src.rows, src.step / src.elemSize());
|
||||
B<T> b(src.rows, src.step);
|
||||
|
||||
if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
|
||||
{
|
||||
@@ -364,6 +365,7 @@ namespace cv { namespace gpu { namespace filters
|
||||
template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float3, short3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float , int >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
@@ -42,14 +42,6 @@
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
#ifndef CV_PI_F
|
||||
#ifndef CV_PI
|
||||
#define CV_PI_F 3.14159265f
|
||||
#else
|
||||
#define CV_PI_F ((float)CV_PI)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Other values are not supported
|
||||
#define CELL_WIDTH 8
|
||||
#define CELL_HEIGHT 8
|
||||
@@ -776,4 +768,4 @@ static void resize_for_hog(const DevMem2D& src, DevMem2D dst, TEX& tex)
|
||||
void resize_8UC1(const DevMem2D& src, DevMem2D dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
|
||||
void resize_8UC4(const DevMem2D& src, DevMem2D dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
|
||||
|
||||
}}}
|
||||
}}}
|
||||
|
||||
@@ -66,8 +66,8 @@ namespace cv { namespace gpu { namespace imgproc
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void remap_3c(const uchar* src, size_t src_step, const float* mapx, const float* mapy, size_t map_step,
|
||||
uchar* dst, size_t dst_step, int width, int height)
|
||||
__global__ void remap_3c(const uchar* src, size_t src_step, const float* mapx, const float* mapy,
|
||||
size_t map_step, uchar* dst, size_t dst_step, int width, int height)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
@@ -131,7 +131,7 @@ namespace cv { namespace gpu { namespace imgproc
|
||||
grid.x = divUp(dst.cols, threads.x);
|
||||
grid.y = divUp(dst.rows, threads.y);
|
||||
|
||||
tex_remap.filterMode = cudaFilterModeLinear;
|
||||
tex_remap.filterMode = cudaFilterModeLinear;
|
||||
tex_remap.addressMode[0] = tex_remap.addressMode[1] = cudaAddressModeWrap;
|
||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
|
||||
cudaSafeCall( cudaBindTexture2D(0, tex_remap, src.data, desc, src.cols, src.rows, src.step) );
|
||||
@@ -139,7 +139,7 @@ namespace cv { namespace gpu { namespace imgproc
|
||||
remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
cudaSafeCall( cudaUnbindTexture(tex_remap) );
|
||||
}
|
||||
|
||||
@@ -151,9 +151,9 @@ namespace cv { namespace gpu { namespace imgproc
|
||||
grid.y = divUp(dst.rows, threads.y);
|
||||
|
||||
remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
/////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
|
||||
@@ -768,6 +768,7 @@ namespace cv { namespace gpu { namespace imgproc
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// mulSpectrums
|
||||
|
||||
@@ -796,6 +797,7 @@ namespace cv { namespace gpu { namespace imgproc
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// mulSpectrums_CONJ
|
||||
|
||||
@@ -825,6 +827,7 @@ namespace cv { namespace gpu { namespace imgproc
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// mulAndScaleSpectrums
|
||||
|
||||
@@ -855,6 +858,7 @@ namespace cv { namespace gpu { namespace imgproc
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// mulAndScaleSpectrums_CONJ
|
||||
|
||||
@@ -885,34 +889,173 @@ namespace cv { namespace gpu { namespace imgproc
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// downsample
|
||||
|
||||
template <typename T>
|
||||
__global__ void downsampleKernel(const PtrStep_<T> src, int rows, int cols, int k, PtrStep_<T> dst)
|
||||
template <typename T, int cn>
|
||||
__global__ void downsampleKernel(const PtrStep_<T> src, DevMem2D_<T> dst)
|
||||
{
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x < cols && y < rows)
|
||||
dst.ptr(y)[x] = src.ptr(y * k)[x * k];
|
||||
if (x < dst.cols && y < dst.rows)
|
||||
{
|
||||
int ch_x = x / cn;
|
||||
dst.ptr(y)[x] = src.ptr(y*2)[ch_x*2*cn + x - ch_x*cn];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
void downsampleCaller(const PtrStep_<T> src, int rows, int cols, int k, PtrStep_<T> dst)
|
||||
template <typename T, int cn>
|
||||
void downsampleCaller(const DevMem2D src, DevMem2D dst)
|
||||
{
|
||||
dim3 threads(16, 16);
|
||||
dim3 threads(32, 8);
|
||||
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
|
||||
|
||||
downsampleKernel<T,cn><<<grid,threads>>>(DevMem2D_<T>(src), DevMem2D_<T>(dst));
|
||||
cudaSafeCall(cudaGetLastError());
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
template void downsampleCaller<uchar,1>(const DevMem2D src, DevMem2D dst);
|
||||
template void downsampleCaller<uchar,2>(const DevMem2D src, DevMem2D dst);
|
||||
template void downsampleCaller<uchar,3>(const DevMem2D src, DevMem2D dst);
|
||||
template void downsampleCaller<uchar,4>(const DevMem2D src, DevMem2D dst);
|
||||
template void downsampleCaller<short,1>(const DevMem2D src, DevMem2D dst);
|
||||
template void downsampleCaller<short,2>(const DevMem2D src, DevMem2D dst);
|
||||
template void downsampleCaller<short,3>(const DevMem2D src, DevMem2D dst);
|
||||
template void downsampleCaller<short,4>(const DevMem2D src, DevMem2D dst);
|
||||
template void downsampleCaller<float,1>(const DevMem2D src, DevMem2D dst);
|
||||
template void downsampleCaller<float,2>(const DevMem2D src, DevMem2D dst);
|
||||
template void downsampleCaller<float,3>(const DevMem2D src, DevMem2D dst);
|
||||
template void downsampleCaller<float,4>(const DevMem2D src, DevMem2D dst);
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// upsample
|
||||
|
||||
template <typename T, int cn>
|
||||
__global__ void upsampleKernel(const PtrStep_<T> src, DevMem2D_<T> dst)
|
||||
{
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x < dst.cols && y < dst.rows)
|
||||
{
|
||||
int ch_x = x / cn;
|
||||
T val = ((ch_x & 1) || (y & 1)) ? 0 : src.ptr(y/2)[ch_x/2*cn + x - ch_x*cn];
|
||||
dst.ptr(y)[x] = val;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T, int cn>
|
||||
void upsampleCaller(const DevMem2D src, DevMem2D dst)
|
||||
{
|
||||
dim3 threads(32, 8);
|
||||
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
|
||||
|
||||
upsampleKernel<T,cn><<<grid,threads>>>(DevMem2D_<T>(src), DevMem2D_<T>(dst));
|
||||
cudaSafeCall(cudaGetLastError());
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
template void upsampleCaller<uchar,1>(const DevMem2D src, DevMem2D dst);
|
||||
template void upsampleCaller<uchar,2>(const DevMem2D src, DevMem2D dst);
|
||||
template void upsampleCaller<uchar,3>(const DevMem2D src, DevMem2D dst);
|
||||
template void upsampleCaller<uchar,4>(const DevMem2D src, DevMem2D dst);
|
||||
template void upsampleCaller<short,1>(const DevMem2D src, DevMem2D dst);
|
||||
template void upsampleCaller<short,2>(const DevMem2D src, DevMem2D dst);
|
||||
template void upsampleCaller<short,3>(const DevMem2D src, DevMem2D dst);
|
||||
template void upsampleCaller<short,4>(const DevMem2D src, DevMem2D dst);
|
||||
template void upsampleCaller<float,1>(const DevMem2D src, DevMem2D dst);
|
||||
template void upsampleCaller<float,2>(const DevMem2D src, DevMem2D dst);
|
||||
template void upsampleCaller<float,3>(const DevMem2D src, DevMem2D dst);
|
||||
template void upsampleCaller<float,4>(const DevMem2D src, DevMem2D dst);
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// buildWarpMaps
|
||||
|
||||
namespace build_warp_maps
|
||||
{
|
||||
__constant__ float cr[9];
|
||||
__constant__ float crinv[9];
|
||||
__constant__ float cf, cs;
|
||||
__constant__ float chalf_w, chalf_h;
|
||||
}
|
||||
|
||||
|
||||
class SphericalMapper
|
||||
{
|
||||
public:
|
||||
static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
|
||||
{
|
||||
using namespace build_warp_maps;
|
||||
|
||||
v /= cs;
|
||||
u /= cs;
|
||||
|
||||
float sinv = sinf(v);
|
||||
float x_ = sinv * sinf(u);
|
||||
float y_ = -cosf(v);
|
||||
float z_ = sinv * cosf(u);
|
||||
|
||||
float z;
|
||||
x = crinv[0]*x_ + crinv[1]*y_ + crinv[2]*z_;
|
||||
y = crinv[3]*x_ + crinv[4]*y_ + crinv[5]*z_;
|
||||
z = crinv[6]*x_ + crinv[7]*y_ + crinv[8]*z_;
|
||||
|
||||
x = cf*x/z + chalf_w;
|
||||
y = cf*y/z + chalf_h;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename Mapper>
|
||||
__global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows,
|
||||
PtrStepf map_x, PtrStepf map_y)
|
||||
{
|
||||
int du = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int dv = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
if (du < cols && dv < rows)
|
||||
{
|
||||
float u = tl_u + du;
|
||||
float v = tl_v + dv;
|
||||
float x, y;
|
||||
Mapper::mapBackward(u, v, x, y);
|
||||
map_x.ptr(dv)[du] = x;
|
||||
map_y.ptr(dv)[du] = y;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
|
||||
const float r[9], const float rinv[9], float f, float s,
|
||||
float half_w, float half_h, cudaStream_t stream)
|
||||
{
|
||||
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr, r, 9*sizeof(float)));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::crinv, rinv, 9*sizeof(float)));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cf, &f, sizeof(float)));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cs, &s, sizeof(float)));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::chalf_w, &half_w, sizeof(float)));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::chalf_h, &half_h, sizeof(float)));
|
||||
|
||||
int cols = map_x.cols;
|
||||
int rows = map_x.rows;
|
||||
|
||||
dim3 threads(32, 8);
|
||||
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
|
||||
|
||||
downsampleKernel<<<grid, threads>>>(src, rows, cols, k, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
buildWarpMapsKernel<SphericalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
|
||||
cudaSafeCall(cudaGetLastError());
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
template void downsampleCaller(const PtrStep src, int rows, int cols, int k, PtrStep dst);
|
||||
template void downsampleCaller(const PtrStepf src, int rows, int cols, int k, PtrStepf dst);
|
||||
|
||||
}}}
|
||||
|
||||
|
||||
@@ -49,6 +49,14 @@
|
||||
#include "npp.h"
|
||||
#include "NPP_staging.hpp"
|
||||
|
||||
#ifndef CV_PI_F
|
||||
#ifndef CV_PI
|
||||
#define CV_PI_F 3.14159265f
|
||||
#else
|
||||
#define CV_PI_F ((float)CV_PI)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace gpu
|
||||
|
||||
Reference in New Issue
Block a user