refactored and fixed some gpu tests
fixed some bugs in gpu module
@@ -91,6 +91,12 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
     bool tr2 = (flags & GEMM_2_T) != 0;
     bool tr3 = (flags & GEMM_3_T) != 0;
 
+    if (src1.type() == CV_64FC2)
+    {
+        if (tr1 || tr2 || tr3)
+            CV_Error(CV_StsNotImplemented, "transpose operation is not implemented for CV_64FC2 type");
+    }
+
     Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
     Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
     Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size();
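With this guard in place, complex double-precision (CV_64FC2) GEMM is only accepted without transposition: any GEMM_*_T flag now raises a cv::Exception up front rather than falling through to cuBLAS. A minimal caller-side sketch of that behaviour, assuming the OpenCV 2.4-era gpu module (opencv2/gpu/gpu.hpp) and a CUDA-capable build; the matrix contents are placeholders.

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <iostream>

int main()
{
    // Two small complex double-precision matrices (values are arbitrary).
    cv::Mat a = cv::Mat::ones(4, 4, CV_64FC2);
    cv::Mat b = cv::Mat::ones(4, 4, CV_64FC2);

    cv::gpu::GpuMat ga(a), gb(b), gdst;

    // Plain product: CV_64FC2 without transposition goes through cublasZgemm.
    cv::gpu::gemm(ga, gb, 1.0, cv::gpu::GpuMat(), 0.0, gdst);

    try
    {
        // Any transpose flag on CV_64FC2 now fails fast with CV_StsNotImplemented.
        cv::gpu::gemm(ga, gb, 1.0, cv::gpu::GpuMat(), 0.0, gdst, cv::GEMM_1_T);
    }
    catch (const cv::Exception& e)
    {
        std::cout << "expected error: " << e.what() << std::endl;
    }

    return 0;
}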
@@ -99,7 +105,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
     CV_Assert(src1Size.width == src2Size.height);
     CV_Assert(src3.empty() || src3Size == dstSize);
 
-    dst.create(dstSize, CV_32FC1);
+    dst.create(dstSize, src1.type());
 
     if (beta != 0)
    {
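The hard-coded CV_32FC1 allocation was the actual bug: the switch further down dispatches to Sgemm/Dgemm/Cgemm/Zgemm based on src1.type(), so any non-CV_32FC1 input received a destination of the wrong element type. A short sketch, under the same API assumptions as above, checking that the output type now follows the input.

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <cassert>

int main()
{
    cv::gpu::GpuMat a(8, 8, CV_32FC2), b(8, 8, CV_32FC2), dst;
    a.setTo(cv::Scalar(1, 0));   // 1 + 0i everywhere
    b.setTo(cv::Scalar(1, 0));

    // Complex single-precision product, dispatched to cublasCgemm.
    cv::gpu::gemm(a, b, 1.0, cv::gpu::GpuMat(), 0.0, dst);

    // After the fix the destination keeps the source type instead of
    // being forced to CV_32FC1.
    assert(dst.type() == CV_32FC2);
    assert(dst.size() == a.size());

    return 0;
}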
@@ -149,7 +155,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
     {
     case CV_32FC1:
         cublasSafeCall( cublasSgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alphaf,
+            &alphaf,
             src2.ptr<float>(), static_cast<int>(src2.step / sizeof(float)),
             src1.ptr<float>(), static_cast<int>(src1.step / sizeof(float)),
             &betaf,
@@ -158,7 +164,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
 
     case CV_64FC1:
         cublasSafeCall( cublasDgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alpha,
+            &alpha,
             src2.ptr<double>(), static_cast<int>(src2.step / sizeof(double)),
             src1.ptr<double>(), static_cast<int>(src1.step / sizeof(double)),
             &beta,
@@ -167,7 +173,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
 
     case CV_32FC2:
         cublasSafeCall( cublasCgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alphacf,
+            &alphacf,
             src2.ptr<cuComplex>(), static_cast<int>(src2.step / sizeof(cuComplex)),
             src1.ptr<cuComplex>(), static_cast<int>(src1.step / sizeof(cuComplex)),
             &betacf,
@@ -176,7 +182,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
 
     case CV_64FC2:
         cublasSafeCall( cublasZgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alphac,
+            &alphac,
             src2.ptr<cuDoubleComplex>(), static_cast<int>(src2.step / sizeof(cuDoubleComplex)),
             src1.ptr<cuDoubleComplex>(), static_cast<int>(src1.step / sizeof(cuDoubleComplex)),
             &betac,
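Note that all four cuBLAS calls pass src2 before src1 and swap the reported dimensions. That is the usual row-major versus column-major trick: cuBLAS expects column-major storage, and a row-major matrix reinterpreted as column-major is its transpose, so asking cuBLAS for B^T * A^T leaves A * B in the row-major destination buffer. A standalone sketch of the index arithmetic behind this reasoning (no CUDA required; the helper names are illustrative).

#include <cassert>
#include <vector>

// Element (r, c) of an R x C matrix stored row-major (the GpuMat layout).
static double rowMajorAt(const std::vector<double>& buf, int cols, int r, int c)
{
    return buf[r * cols + c];
}

// The same buffer viewed as a column-major matrix with leading dimension
// 'cols' (cuBLAS's point of view): element (i, j) of a cols x rows matrix.
static double colMajorAt(const std::vector<double>& buf, int cols, int i, int j)
{
    return buf[j * cols + i];
}

int main()
{
    const int rows = 2, cols = 3;
    const std::vector<double> a = { 1, 2, 3,
                                    4, 5, 6 };   // row-major 2x3

    // Reinterpreting the row-major buffer as column-major yields the transpose.
    for (int i = 0; i < cols; ++i)
        for (int j = 0; j < rows; ++j)
            assert(colMajorAt(a, cols, i, j) == rowMajorAt(a, cols, j, i));

    // Therefore C = A * B can be obtained by asking cuBLAS for B^T * A^T with
    // the operands and their dimensions swapped: the column-major result it
    // writes is exactly the row-major C the caller expects.
    return 0;
}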
@@ -208,8 +214,8 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
         sz.width = src.cols;
         sz.height = src.rows;
 
-        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
+        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
     }
     else if (src.elemSize() == 4)
     {
@@ -219,7 +225,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
         sz.width = src.cols;
         sz.height = src.rows;
 
-        ncvSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), static_cast<int>(src.step),
+        ncvSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), static_cast<int>(src.step),
             dst.ptr<Ncv32u>(), static_cast<int>(dst.step), sz) );
     }
     else // if (src.elemSize() == 8)
@@ -230,8 +236,8 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
         sz.width = src.cols;
         sz.height = src.rows;
 
-        ncvSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), static_cast<int>(src.step),
-            dst.ptr<Ncv64u>(), static_cast<int>(dst.step), sz) );
+        ncvSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), static_cast<int>(src.step),
+            dst.ptr<Ncv64u>(), static_cast<int>(dst.step), sz) );
     }
 
     if (stream == 0)
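cv::gpu::transpose picks its kernel purely by element size (1, 4 or 8 bytes), so for example CV_32FC1 and CV_32SC1 both take the 32-bit word path; the hunks above only touch the formatting of those calls. A standalone sketch of the same idea, transposing opaque fixed-size elements without inspecting their type (the helper name is illustrative).

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Transpose a rows x cols matrix of opaque elemSize-byte elements,
// the way the GPU path dispatches on elemSize() rather than on type.
static void transposeBytes(const void* src, void* dst, int rows, int cols, size_t elemSize)
{
    const uint8_t* s = static_cast<const uint8_t*>(src);
    uint8_t* d = static_cast<uint8_t*>(dst);
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            std::memcpy(d + (c * rows + r) * elemSize,
                        s + (r * cols + c) * elemSize, elemSize);
}

int main()
{
    // A float matrix and an int matrix of the same shape share the 4-byte
    // path; only the element size matters to the transpose itself.
    std::vector<float> f = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f };   // 2x3
    std::vector<float> ft(6);
    transposeBytes(f.data(), ft.data(), 2, 3, sizeof(float));
    assert(ft[0] == 1.f && ft[1] == 4.f && ft[2] == 2.f);
    return 0;
}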
@@ -285,7 +291,7 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
 {
     typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
 
-    static const func_t funcs[6][4] =
+    static const func_t funcs[6][4] =
     {
         {NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call},
         {0,0,0,0},
@@ -345,7 +351,7 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
 
     if (src.type() == CV_8UC1)
     {
-        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
             dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );
     }
     else
@@ -361,7 +367,7 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
             pValues3[1] = nppLut3[1].ptr<Npp32s>();
             pValues3[2] = nppLut3[2].ptr<Npp32s>();
         }
-        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
             dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, lvls.pLevels3, lvls.nValues3) );
     }
 
@@ -408,9 +414,9 @@ void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
 ////////////////////////////////////////////////////////////////////////
 // Polar <-> Cart
 
-namespace cv { namespace gpu { namespace device
+namespace cv { namespace gpu { namespace device
 {
-    namespace mathfunc
+    namespace mathfunc
     {
         void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
         void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
@@ -1672,40 +1672,53 @@ namespace cv { namespace gpu { namespace device
     template<typename T, bool Signed = device::numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>
     {
-        float power;
-        PowOp(float power_) : power(power_) {}
+        const float power;
+
+        PowOp(double power_) : power(static_cast<float>(power_)) {}
 
-        __device__ __forceinline__ T operator()(const T& e) const
+        __device__ __forceinline__ T operator()(T e) const
         {
             return saturate_cast<T>(__powf((float)e, power));
         }
     };
 
     template<typename T> struct PowOp<T, true> : unary_function<T, T>
     {
-        float power;
-        PowOp(float power_) : power(power_) {}
+        const float power;
+
+        PowOp(double power_) : power(static_cast<float>(power_)) {}
 
-        __device__ __forceinline__ float operator()(const T& e) const
+        __device__ __forceinline__ T operator()(T e) const
         {
             T res = saturate_cast<T>(__powf((float)e, power));
 
-            if ( (e < 0) && (1 & (int)power) )
-                res *= -1;
+            if ((e < 0) && (1 & static_cast<int>(power)))
+                res *= -1;
 
             return res;
         }
     };
 
     template<> struct PowOp<float> : unary_function<float, float>
     {
-        float power;
-        PowOp(float power_) : power(power_) {}
+        const float power;
+
+        PowOp(double power_) : power(static_cast<float>(power_)) {}
 
-        __device__ __forceinline__ float operator()(const float& e) const
+        __device__ __forceinline__ float operator()(float e) const
         {
             return __powf(::fabs(e), power);
         }
     };
+    template<> struct PowOp<double> : unary_function<double, double>
+    {
+        const double power;
+
+        PowOp(double power_) : power(power_) {}
+
+        __device__ __forceinline__ double operator()(double e) const
+        {
+            return ::pow(::fabs(e), power);
+        }
+    };
 
     namespace detail
     {
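PowOp is now specialized four ways: unsigned integers go straight through __powf with saturation, signed integers flip the sign of the result when the input is negative and the (truncated) exponent is odd, and the float and double specializations raise the absolute value to the power. The constructors also take double, so the host-side exponent is no longer narrowed before the functor is built. A host-side sketch of the odd-exponent sign rule; for clarity it raises the magnitude to the power, whereas the device functor feeds e to __powf directly and saturates the result.

#include <cassert>
#include <cmath>

// Host-side model of the sign rule: raise the magnitude to the power and
// restore the sign when the exponent is odd. The device code additionally
// saturates the result to the destination type.
static int powSigned(int e, float power)
{
    int res = static_cast<int>(std::pow(std::fabs(static_cast<float>(e)), power) + 0.5f);
    if ((e < 0) && (1 & static_cast<int>(power)))
        res *= -1;
    return res;
}

int main()
{
    assert(powSigned( 3, 2.f) ==   9);
    assert(powSigned(-3, 2.f) ==   9);   // even power: the sign drops out
    assert(powSigned(-3, 3.f) == -27);   // odd power: the sign is restored
    return 0;
}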
@@ -1733,17 +1746,18 @@ namespace cv { namespace gpu { namespace device
     };
 
     template<typename T>
-    void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream)
+    void pow_caller(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream)
     {
         cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<T>)dst, PowOp<T>(power), WithOutMask(), stream);
     }
 
-    template void pow_caller<uchar>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
-    template void pow_caller<schar>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
-    template void pow_caller<short>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
-    template void pow_caller<ushort>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
-    template void pow_caller<int>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
-    template void pow_caller<float>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
+    template void pow_caller<uchar>(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
+    template void pow_caller<schar>(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
+    template void pow_caller<short>(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
+    template void pow_caller<ushort>(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
+    template void pow_caller<int>(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
+    template void pow_caller<float>(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
+    template void pow_caller<double>(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
 
     //////////////////////////////////////////////////////////////////////////
     // addWeighted
@@ -1301,50 +1301,26 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
     };
 
     CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
     CV_Assert(cmpop >= CMP_EQ && cmpop <= CMP_NE);
 
-    int code;
-    const GpuMat* psrc1;
-    const GpuMat* psrc2;
-
-    switch (cmpop)
+    static const int codes[] =
     {
-    case CMP_EQ:
-        code = 0;
-        psrc1 = &src1;
-        psrc2 = &src2;
-        break;
-    case CMP_GE:
-        code = 3;
-        psrc1 = &src2;
-        psrc2 = &src1;
-        break;
-    case CMP_GT:
-        code = 2;
-        psrc1 = &src2;
-        psrc2 = &src1;
-        break;
-    case CMP_LE:
-        code = 3;
-        psrc1 = &src1;
-        psrc2 = &src2;
-        break;
-    case CMP_LT:
-        code = 2;
-        psrc1 = &src1;
-        psrc2 = &src2;
-        break;
-    case CMP_NE:
-        code = 1;
-        psrc1 = &src1;
-        psrc2 = &src2;
-        break;
-    default:
-        CV_Error(CV_StsBadFlag, "Incorrect compare operation");
+        0, 2, 3, 2, 3, 1
     };
 
+    const GpuMat* psrc1[] =
+    {
+        &src1, &src2, &src2, &src1, &src1, &src1
+    };
+
+    const GpuMat* psrc2[] =
+    {
+        &src2, &src1, &src1, &src2, &src2, &src2
+    };
+
     dst.create(src1.size(), CV_MAKE_TYPE(CV_8U, src1.channels()));
 
-    funcs[src1.depth()][code](psrc1->reshape(1), psrc2->reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
+    funcs[src1.depth()][codes[cmpop]](psrc1[cmpop]->reshape(1), psrc2[cmpop]->reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
 }
 
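The switch is replaced by three tables indexed directly by cmpop, which relies on OpenCV's enum ordering CMP_EQ=0, CMP_GT=1, CMP_GE=2, CMP_LT=3, CMP_LE=4, CMP_NE=5. Only four kernel codes exist (0 = eq, 1 = ne, 2 = lt, 3 = le); the "greater" comparisons are expressed by swapping the operands, since a > b is the same as b < a. A standalone sketch checking that the tables reproduce the (code, operand order) pairs the old switch produced.

#include <cassert>

// Mirrors cv::CMP_* ordering, which the table-based dispatch assumes.
enum { CMP_EQ = 0, CMP_GT = 1, CMP_GE = 2, CMP_LT = 3, CMP_LE = 4, CMP_NE = 5 };

int main()
{
    // code: 0 = eq, 1 = ne, 2 = lt, 3 = le (the kernel repertoire);
    // swapped: whether src1/src2 are exchanged so GT/GE become LT/LE.
    static const int  codes[]   = { 0, 2, 3, 2, 3, 1 };
    static const bool swapped[] = { false, true, true, false, false, false };

    // Same answers the old switch produced:
    assert(codes[CMP_EQ] == 0 && !swapped[CMP_EQ]);
    assert(codes[CMP_GT] == 2 &&  swapped[CMP_GT]);   // a > b  ==  b < a
    assert(codes[CMP_GE] == 3 &&  swapped[CMP_GE]);   // a >= b ==  b <= a
    assert(codes[CMP_LT] == 2 && !swapped[CMP_LT]);
    assert(codes[CMP_LE] == 3 && !swapped[CMP_LE]);
    assert(codes[CMP_NE] == 1 && !swapped[CMP_NE]);
    return 0;
}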
@@ -1944,26 +1920,25 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
 namespace cv { namespace gpu { namespace device
 {
     template<typename T>
-    void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
+    void pow_caller(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
 }}}
 
 void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
 {
-    using namespace ::cv::gpu::device;
+    using namespace cv::gpu::device;
 
-    CV_Assert(src.depth() != CV_64F);
-    dst.create(src.size(), src.type());
-
-    typedef void (*caller_t)(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
+    typedef void (*func_t)(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
 
-    static const caller_t callers[] =
+    static const func_t funcs[] =
     {
         pow_caller<unsigned char>, pow_caller<signed char>,
         pow_caller<unsigned short>, pow_caller<short>,
-        pow_caller<int>, pow_caller<float>
+        pow_caller<int>, pow_caller<float>, pow_caller<double>
     };
 
-    callers[src.depth()](src.reshape(1), (float)power, dst.reshape(1), StreamAccessor::getStream(stream));
+    dst.create(src.size(), src.type());
+
+    funcs[src.depth()](src.reshape(1), power, dst.reshape(1), StreamAccessor::getStream(stream));
 }
 
 ////////////////////////////////////////////////////////////////////////
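With the CV_Assert(src.depth() != CV_64F) gone and pow_caller<double> added, cv::gpu::pow accepts CV_64F matrices, and the exponent is passed through as double instead of being truncated to float on the host. A hedged usage sketch, again assuming the 2.4-era gpu module and a device with double-precision support.

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <cassert>
#include <cmath>

int main()
{
    cv::Mat host(1, 4, CV_64FC1);
    host.at<double>(0, 0) = 1.0;
    host.at<double>(0, 1) = 2.0;
    host.at<double>(0, 2) = 3.0;
    host.at<double>(0, 3) = 4.0;

    cv::gpu::GpuMat src(host), dst;
    cv::gpu::pow(src, 2.0, dst);          // CV_64F input is now accepted

    cv::Mat result;
    dst.download(result);
    assert(std::fabs(result.at<double>(0, 3) - 16.0) < 1e-9);
    return 0;
}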
@@ -2052,27 +2027,11 @@ namespace cv { namespace gpu { namespace device
 
 void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype, Stream& stream)
 {
-    using namespace ::cv::gpu::device;
+    using namespace cv::gpu::device;
 
-    CV_Assert(src1.size() == src2.size());
-    CV_Assert(src1.type() == src2.type() || (dtype >= 0 && src1.channels() == src2.channels()));
-
-    dtype = dtype >= 0 ? CV_MAKETYPE(dtype, src1.channels()) : src1.type();
-
-    dst.create(src1.size(), dtype);
-
-    const GpuMat* psrc1 = &src1;
-    const GpuMat* psrc2 = &src2;
-
-    if (src1.depth() > src2.depth())
-    {
-        std::swap(psrc1, psrc2);
-        std::swap(alpha, beta);
-    }
-
-    typedef void (*caller_t)(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);
+    typedef void (*func_t)(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);
 
-    static const caller_t callers[7][7][7] =
+    static const func_t funcs[7][7][7] =
     {
         {
             {
@@ -2531,7 +2490,26 @@ void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2,
         }
     };
 
-    callers[psrc1->depth()][psrc2->depth()][dst.depth()](psrc1->reshape(1), alpha, psrc2->reshape(1), beta, gamma, dst.reshape(1), StreamAccessor::getStream(stream));
+    CV_Assert(src1.size() == src2.size());
+    CV_Assert(src1.type() == src2.type() || (dtype >= 0 && src1.channels() == src2.channels()));
+
+    dtype = dtype >= 0 ? CV_MAKETYPE(dtype, src1.channels()) : src1.type();
+
+    dst.create(src1.size(), dtype);
+
+    const GpuMat* psrc1 = &src1;
+    const GpuMat* psrc2 = &src2;
+
+    if (src1.depth() > src2.depth())
+    {
+        std::swap(psrc1, psrc2);
+        std::swap(alpha, beta);
+    }
+
+    const func_t func = funcs[psrc1->depth()][psrc2->depth()][dst.depth()];
+    CV_Assert(func != 0);
+
+    func(psrc1->reshape(1), alpha, psrc2->reshape(1), beta, gamma, dst.reshape(1), StreamAccessor::getStream(stream));
 }
 
 #endif
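The dispatch is now looked up once and validated: operands are swapped so the matrix with the smaller depth always comes first (alpha and beta are swapped with them), which means table entries with the first depth greater than the second never need to be filled, and CV_Assert(func != 0) turns an unsupported (depth1, depth2, ddepth) combination into a clear error instead of a call through a null pointer. A toy, two-index sketch of that pattern; the names are illustrative, not the module's.

#include <algorithm>
#include <cassert>

// Two-index toy of the addWeighted dispatch: a sparse table of function
// pointers, an operand swap so the smaller depth is always the first index,
// and a hard check for combinations that have no kernel.
typedef double (*op_t)(double a, double alpha, double b, double beta);

static double weightedSum(double a, double alpha, double b, double beta)
{
    return a * alpha + b * beta;
}

int main()
{
    const int DEPTHS = 7;                     // CV_8U .. CV_64F
    static op_t table[DEPTHS][DEPTHS] = {};   // mostly empty on purpose
    table[0][1] = weightedSum;                // only the "sorted" (0, 1) pair is filled

    int d1 = 1, d2 = 0;                       // caller passes depths in the "wrong" order
    double a = 10.0,  alpha = 2.0;
    double b = 100.0, beta  = 3.0;

    if (d1 > d2)                              // same move the module makes on psrc1/psrc2
    {
        std::swap(d1, d2);
        std::swap(a, b);
        std::swap(alpha, beta);
    }

    op_t func = table[d1][d2];
    assert(func != 0);                        // plays the role of CV_Assert(func != 0)
    assert(func(a, alpha, b, beta) == 10.0 * 2.0 + 100.0 * 3.0);

    return 0;
}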