Revert "Merge pull request #836 from jet47:gpu-modules"

This reverts commit fba72cb60d, reversing
changes made to 02131ffb62.
This commit is contained in:
Andrey Kamaev
2013-04-18 15:03:50 +04:00
parent fba72cb60d
commit 416fb50594
472 changed files with 22945 additions and 29803 deletions

View File

@@ -1,735 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::gemm(const GpuMat&, const GpuMat&, double, const GpuMat&, double, GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool, Stream&) { throw_no_cuda(); }
void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool, Stream&) { throw_no_cuda(); }
void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_no_cuda(); }
void cv::gpu::ConvolveBuf::create(Size, Size) { throw_no_cuda(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_no_cuda(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
namespace
{
#define error_entry(entry) { entry, #entry }
struct ErrorEntry
{
int code;
const char* str;
};
struct ErrorEntryComparer
{
int code;
ErrorEntryComparer(int code_) : code(code_) {}
bool operator()(const ErrorEntry& e) const { return e.code == code; }
};
String getErrorString(int code, const ErrorEntry* errors, size_t n)
{
size_t idx = std::find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;
const char* msg = (idx != n) ? errors[idx].str : "Unknown error code";
String str = cv::format("%s [Code = %d]", msg, code);
return str;
}
}
#ifdef HAVE_CUBLAS
namespace
{
const ErrorEntry cublas_errors[] =
{
error_entry( CUBLAS_STATUS_SUCCESS ),
error_entry( CUBLAS_STATUS_NOT_INITIALIZED ),
error_entry( CUBLAS_STATUS_ALLOC_FAILED ),
error_entry( CUBLAS_STATUS_INVALID_VALUE ),
error_entry( CUBLAS_STATUS_ARCH_MISMATCH ),
error_entry( CUBLAS_STATUS_MAPPING_ERROR ),
error_entry( CUBLAS_STATUS_EXECUTION_FAILED ),
error_entry( CUBLAS_STATUS_INTERNAL_ERROR )
};
const size_t cublas_error_num = sizeof(cublas_errors) / sizeof(cublas_errors[0]);
static inline void ___cublasSafeCall(cublasStatus_t err, const char* file, const int line, const char* func)
{
if (CUBLAS_STATUS_SUCCESS != err)
{
String msg = getErrorString(err, cublas_errors, cublas_error_num);
cv::error(cv::Error::GpuApiCallError, msg, func, file, line);
}
}
}
#if defined(__GNUC__)
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__, "")
#endif
#endif // HAVE_CUBLAS
#ifdef HAVE_CUFFT
namespace
{
//////////////////////////////////////////////////////////////////////////
// CUFFT errors
const ErrorEntry cufft_errors[] =
{
error_entry( CUFFT_INVALID_PLAN ),
error_entry( CUFFT_ALLOC_FAILED ),
error_entry( CUFFT_INVALID_TYPE ),
error_entry( CUFFT_INVALID_VALUE ),
error_entry( CUFFT_INTERNAL_ERROR ),
error_entry( CUFFT_EXEC_FAILED ),
error_entry( CUFFT_SETUP_FAILED ),
error_entry( CUFFT_INVALID_SIZE ),
error_entry( CUFFT_UNALIGNED_DATA )
};
const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]);
void ___cufftSafeCall(int err, const char* file, const int line, const char* func)
{
if (CUFFT_SUCCESS != err)
{
String msg = getErrorString(err, cufft_errors, cufft_error_num);
cv::error(cv::Error::GpuApiCallError, msg, func, file, line);
}
}
}
#if defined(__GNUC__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, "")
#endif
#endif
////////////////////////////////////////////////////////////////////////
// gemm
void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
{
#ifndef HAVE_CUBLAS
(void)src1;
(void)src2;
(void)alpha;
(void)src3;
(void)beta;
(void)dst;
(void)flags;
(void)stream;
CV_Error(cv::Error::StsNotImplemented, "The library was build without CUBLAS");
#else
// CUBLAS works with column-major matrices
CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));
if (src1.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
bool tr1 = (flags & GEMM_1_T) != 0;
bool tr2 = (flags & GEMM_2_T) != 0;
bool tr3 = (flags & GEMM_3_T) != 0;
if (src1.type() == CV_64FC2)
{
if (tr1 || tr2 || tr3)
CV_Error(cv::Error::StsNotImplemented, "transpose operation doesn't implemented for CV_64FC2 type");
}
Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size();
Size dstSize(src2Size.width, src1Size.height);
CV_Assert(src1Size.width == src2Size.height);
CV_Assert(src3.empty() || src3Size == dstSize);
dst.create(dstSize, src1.type());
if (beta != 0)
{
if (src3.empty())
{
if (stream)
stream.enqueueMemSet(dst, Scalar::all(0));
else
dst.setTo(Scalar::all(0));
}
else
{
if (tr3)
{
gpu::transpose(src3, dst, stream);
}
else
{
if (stream)
stream.enqueueCopy(src3, dst);
else
src3.copyTo(dst);
}
}
}
cublasHandle_t handle;
cublasSafeCall( cublasCreate_v2(&handle) );
cublasSafeCall( cublasSetStream_v2(handle, StreamAccessor::getStream(stream)) );
cublasSafeCall( cublasSetPointerMode_v2(handle, CUBLAS_POINTER_MODE_HOST) );
const float alphaf = static_cast<float>(alpha);
const float betaf = static_cast<float>(beta);
const cuComplex alphacf = make_cuComplex(alphaf, 0);
const cuComplex betacf = make_cuComplex(betaf, 0);
const cuDoubleComplex alphac = make_cuDoubleComplex(alpha, 0);
const cuDoubleComplex betac = make_cuDoubleComplex(beta, 0);
cublasOperation_t transa = tr2 ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t transb = tr1 ? CUBLAS_OP_T : CUBLAS_OP_N;
switch (src1.type())
{
case CV_32FC1:
cublasSafeCall( cublasSgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alphaf,
src2.ptr<float>(), static_cast<int>(src2.step / sizeof(float)),
src1.ptr<float>(), static_cast<int>(src1.step / sizeof(float)),
&betaf,
dst.ptr<float>(), static_cast<int>(dst.step / sizeof(float))) );
break;
case CV_64FC1:
cublasSafeCall( cublasDgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alpha,
src2.ptr<double>(), static_cast<int>(src2.step / sizeof(double)),
src1.ptr<double>(), static_cast<int>(src1.step / sizeof(double)),
&beta,
dst.ptr<double>(), static_cast<int>(dst.step / sizeof(double))) );
break;
case CV_32FC2:
cublasSafeCall( cublasCgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alphacf,
src2.ptr<cuComplex>(), static_cast<int>(src2.step / sizeof(cuComplex)),
src1.ptr<cuComplex>(), static_cast<int>(src1.step / sizeof(cuComplex)),
&betacf,
dst.ptr<cuComplex>(), static_cast<int>(dst.step / sizeof(cuComplex))) );
break;
case CV_64FC2:
cublasSafeCall( cublasZgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alphac,
src2.ptr<cuDoubleComplex>(), static_cast<int>(src2.step / sizeof(cuDoubleComplex)),
src1.ptr<cuDoubleComplex>(), static_cast<int>(src1.step / sizeof(cuDoubleComplex)),
&betac,
dst.ptr<cuDoubleComplex>(), static_cast<int>(dst.step / sizeof(cuDoubleComplex))) );
break;
}
cublasSafeCall( cublasDestroy_v2(handle) );
#endif
}
////////////////////////////////////////////////////////////////////////
// integral
void cv::gpu::integral(const GpuMat& src, GpuMat& sum, Stream& s)
{
GpuMat buffer;
gpu::integralBuffered(src, sum, buffer, s);
}
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
}
}}}
void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& s)
{
CV_Assert(src.type() == CV_8UC1);
cudaStream_t stream = StreamAccessor::getStream(s);
cv::Size whole;
cv::Point offset;
src.locateROI(whole, offset);
if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
&& offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
{
ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
cv::gpu::cudev::imgproc::shfl_integral_gpu(src, buffer, stream);
sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
if (s)
s.enqueueMemSet(sum, Scalar::all(0));
else
sum.setTo(Scalar::all(0));
GpuMat inner = sum(Rect(1, 1, src.cols, src.rows));
GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
if (s)
s.enqueueCopy(res, inner);
else
res.copyTo(inner);
}
else
{
#ifndef HAVE_OPENCV_GPULEGACY
throw_no_cuda();
#else
sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
NcvSize32u roiSize;
roiSize.width = src.cols;
roiSize.height = src.rows;
cudaDeviceProp prop;
cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
Ncv32u bufSize;
ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
NppStStreamHandler h(stream);
ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
#endif
}
}
//////////////////////////////////////////////////////////////////////////////
// sqrIntegral
void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
{
#ifndef HAVE_OPENCV_GPULEGACY
(void) src;
(void) sqsum;
(void) s;
throw_no_cuda();
#else
CV_Assert(src.type() == CV_8U);
NcvSize32u roiSize;
roiSize.width = src.cols;
roiSize.height = src.rows;
cudaDeviceProp prop;
cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
Ncv32u bufSize;
ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
GpuMat buf(1, bufSize, CV_8U);
cudaStream_t stream = StreamAccessor::getStream(s);
NppStStreamHandler h(stream);
sqsum.create(src.rows + 1, src.cols + 1, CV_64F);
ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
sqsum.ptr<Ncv64u>(0), static_cast<int>(sqsum.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
#endif
}
//////////////////////////////////////////////////////////////////////////////
// mulSpectrums
#ifdef HAVE_CUFFT
namespace cv { namespace gpu { namespace cudev
{
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
}}}
#endif
void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream)
{
#ifndef HAVE_CUFFT
(void) a;
(void) b;
(void) c;
(void) flags;
(void) conjB;
(void) stream;
throw_no_cuda();
#else
(void) flags;
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, PtrStepSz<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { cudev::mulSpectrums, cudev::mulSpectrums_CONJ };
CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
CV_Assert(a.size() == b.size());
c.create(a.size(), CV_32FC2);
Caller caller = callers[(int)conjB];
caller(a, b, c, StreamAccessor::getStream(stream));
#endif
}
//////////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums
#ifdef HAVE_CUFFT
namespace cv { namespace gpu { namespace cudev
{
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
}}}
#endif
void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream)
{
#ifndef HAVE_CUFFT
(void) a;
(void) b;
(void) c;
(void) flags;
(void) scale;
(void) conjB;
(void) stream;
throw_no_cuda();
#else
(void)flags;
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, PtrStepSz<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { cudev::mulAndScaleSpectrums, cudev::mulAndScaleSpectrums_CONJ };
CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
CV_Assert(a.size() == b.size());
c.create(a.size(), CV_32FC2);
Caller caller = callers[(int)conjB];
caller(a, b, scale, c, StreamAccessor::getStream(stream));
#endif
}
//////////////////////////////////////////////////////////////////////////////
// dft
void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stream& stream)
{
#ifndef HAVE_CUFFT
(void) src;
(void) dst;
(void) dft_size;
(void) flags;
(void) stream;
throw_no_cuda();
#else
CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2);
// We don't support unpacked output (in the case of real input)
CV_Assert(!(flags & DFT_COMPLEX_OUTPUT));
bool is_1d_input = (dft_size.height == 1) || (dft_size.width == 1);
int is_row_dft = flags & DFT_ROWS;
int is_scaled_dft = flags & DFT_SCALE;
int is_inverse = flags & DFT_INVERSE;
bool is_complex_input = src.channels() == 2;
bool is_complex_output = !(flags & DFT_REAL_OUTPUT);
// We don't support real-to-real transform
CV_Assert(is_complex_input || is_complex_output);
GpuMat src_data;
// Make sure here we work with the continuous input,
// as CUFFT can't handle gaps
src_data = src;
createContinuous(src.rows, src.cols, src.type(), src_data);
if (src_data.data != src.data)
src.copyTo(src_data);
Size dft_size_opt = dft_size;
if (is_1d_input && !is_row_dft)
{
// If the source matrix is single column handle it as single row
dft_size_opt.width = std::max(dft_size.width, dft_size.height);
dft_size_opt.height = std::min(dft_size.width, dft_size.height);
}
cufftType dft_type = CUFFT_R2C;
if (is_complex_input)
dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R;
CV_Assert(dft_size_opt.width > 1);
cufftHandle plan;
if (is_1d_input || is_row_dft)
cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height);
else
cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type);
cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );
if (is_complex_input)
{
if (is_complex_output)
{
createContinuous(dft_size, CV_32FC2, dst);
cufftSafeCall(cufftExecC2C(
plan, src_data.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
}
else
{
createContinuous(dft_size, CV_32F, dst);
cufftSafeCall(cufftExecC2R(
plan, src_data.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
}
}
else
{
// We could swap dft_size for efficiency. Here we must reflect it
if (dft_size == dft_size_opt)
createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, dst);
else
createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, dst);
cufftSafeCall(cufftExecR2C(
plan, src_data.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
}
cufftSafeCall(cufftDestroy(plan));
if (is_scaled_dft)
multiply(dst, Scalar::all(1. / dft_size.area()), dst, 1, -1, stream);
#endif
}
//////////////////////////////////////////////////////////////////////////////
// convolve
void cv::gpu::ConvolveBuf::create(Size image_size, Size templ_size)
{
result_size = Size(image_size.width - templ_size.width + 1,
image_size.height - templ_size.height + 1);
block_size = user_block_size;
if (user_block_size.width == 0 || user_block_size.height == 0)
block_size = estimateBlockSize(result_size, templ_size);
dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.)));
dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.)));
// CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192),
// see CUDA Toolkit 4.1 CUFFT Library Programming Guide
if (dft_size.width > 8192)
dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1);
if (dft_size.height > 8192)
dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1);
// To avoid wasting time doing small DFTs
dft_size.width = std::max(dft_size.width, 512);
dft_size.height = std::max(dft_size.height, 512);
createContinuous(dft_size, CV_32F, image_block);
createContinuous(dft_size, CV_32F, templ_block);
createContinuous(dft_size, CV_32F, result_data);
spect_len = dft_size.height * (dft_size.width / 2 + 1);
createContinuous(1, spect_len, CV_32FC2, image_spect);
createContinuous(1, spect_len, CV_32FC2, templ_spect);
createContinuous(1, spect_len, CV_32FC2, result_spect);
// Use maximum result matrix block size for the estimated DFT block size
block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width);
block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height);
}
Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size*/)
{
int width = (result_size.width + 2) / 3;
int height = (result_size.height + 2) / 3;
width = std::min(width, result_size.width);
height = std::min(height, result_size.height);
return Size(width, height);
}
void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr)
{
ConvolveBuf buf;
gpu::convolve(image, templ, result, ccorr, buf);
}
void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)
{
#ifndef HAVE_CUFFT
(void) image;
(void) templ;
(void) result;
(void) ccorr;
(void) buf;
(void) stream;
throw_no_cuda();
#else
using namespace cv::gpu::cudev::imgproc;
CV_Assert(image.type() == CV_32F);
CV_Assert(templ.type() == CV_32F);
buf.create(image.size(), templ.size());
result.create(buf.result_size, CV_32F);
Size& block_size = buf.block_size;
Size& dft_size = buf.dft_size;
GpuMat& image_block = buf.image_block;
GpuMat& templ_block = buf.templ_block;
GpuMat& result_data = buf.result_data;
GpuMat& image_spect = buf.image_spect;
GpuMat& templ_spect = buf.templ_spect;
GpuMat& result_spect = buf.result_spect;
cufftHandle planR2C, planC2R;
cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));
cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );
cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );
GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step);
gpu::copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
templ_block.cols - templ_roi.cols, 0, Scalar(), stream);
cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(),
templ_spect.ptr<cufftComplex>()));
// Process all blocks of the result matrix
for (int y = 0; y < result.rows; y += block_size.height)
{
for (int x = 0; x < result.cols; x += block_size.width)
{
Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
std::min(y + dft_size.height, image.rows) - y);
GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
image.step);
gpu::copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
0, image_block.cols - image_roi.cols, 0, Scalar(), stream);
cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
image_spect.ptr<cufftComplex>()));
gpu::mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
1.f / dft_size.area(), ccorr, stream);
cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
result_data.ptr<cufftReal>()));
Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
std::min(y + block_size.height, result.rows) - y);
GpuMat result_roi(result_roi_size, result.type(),
(void*)(result.ptr<float>(y) + x), result.step);
GpuMat result_block(result_roi_size, result_data.type(),
result_data.ptr(), result_data.step);
if (stream)
stream.enqueueCopy(result_block, result_roi);
else
result_block.copyTo(result_roi);
}
}
cufftSafeCall(cufftDestroy(planR2C));
cufftSafeCall(cufftDestroy(planC2R));
#endif
}
#endif /* !defined (HAVE_CUDA) */

View File

@@ -1,488 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
void cv::gpu::merge(const std::vector<GpuMat>& /*src*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
void cv::gpu::split(const GpuMat& /*src*/, std::vector<GpuMat>& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, int, const Scalar&, Stream&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
////////////////////////////////////////////////////////////////////////
// merge/split
namespace cv { namespace gpu { namespace cudev
{
namespace split_merge
{
void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
}
}}}
namespace
{
void merge(const GpuMat* src, size_t n, GpuMat& dst, const cudaStream_t& stream)
{
using namespace ::cv::gpu::cudev::split_merge;
CV_Assert(src);
CV_Assert(n > 0);
int depth = src[0].depth();
Size size = src[0].size();
if (depth == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
bool single_channel_only = true;
int total_channels = 0;
for (size_t i = 0; i < n; ++i)
{
CV_Assert(src[i].size() == size);
CV_Assert(src[i].depth() == depth);
single_channel_only = single_channel_only && src[i].channels() == 1;
total_channels += src[i].channels();
}
CV_Assert(single_channel_only);
CV_Assert(total_channels <= 4);
if (total_channels == 1)
src[0].copyTo(dst);
else
{
dst.create(size, CV_MAKETYPE(depth, total_channels));
PtrStepSzb src_as_devmem[4];
for(size_t i = 0; i < n; ++i)
src_as_devmem[i] = src[i];
PtrStepSzb dst_as_devmem(dst);
merge_caller(src_as_devmem, dst_as_devmem, total_channels, CV_ELEM_SIZE(depth), stream);
}
}
void split(const GpuMat& src, GpuMat* dst, const cudaStream_t& stream)
{
using namespace ::cv::gpu::cudev::split_merge;
CV_Assert(dst);
int depth = src.depth();
int num_channels = src.channels();
if (depth == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
if (num_channels == 1)
{
src.copyTo(dst[0]);
return;
}
for (int i = 0; i < num_channels; ++i)
dst[i].create(src.size(), depth);
CV_Assert(num_channels <= 4);
PtrStepSzb dst_as_devmem[4];
for (int i = 0; i < num_channels; ++i)
dst_as_devmem[i] = dst[i];
PtrStepSzb src_as_devmem(src);
split_caller(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), stream);
}
}
void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream)
{
::merge(src, n, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::merge(const std::vector<GpuMat>& src, GpuMat& dst, Stream& stream)
{
::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));
}
void cv::gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream)
{
::split(src, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::split(const GpuMat& src, std::vector<GpuMat>& dst, Stream& stream)
{
dst.resize(src.channels());
if(src.channels() > 0)
::split(src, &dst[0], StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
// transpose
namespace arithm
{
template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream);
}
void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
{
CV_Assert( src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8 );
dst.create( src.cols, src.rows, src.type() );
cudaStream_t stream = StreamAccessor::getStream(s);
if (src.elemSize() == 1)
{
NppStreamHandler h(stream);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
else if (src.elemSize() == 4)
{
arithm::transpose<int>(src, dst, stream);
}
else // if (src.elemSize() == 8)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
arithm::transpose<double>(src, dst, stream);
}
}
////////////////////////////////////////////////////////////////////////
// flip
namespace
{
template<int DEPTH> struct NppTypeTraits;
template<> struct NppTypeTraits<CV_8U> { typedef Npp8u npp_t; };
template<> struct NppTypeTraits<CV_8S> { typedef Npp8s npp_t; };
template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; };
template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; };
template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; };
template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; };
template <int DEPTH> struct NppMirrorFunc
{
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip);
};
template <int DEPTH, typename NppMirrorFunc<DEPTH>::func_t func> struct NppMirror
{
typedef typename NppMirrorFunc<DEPTH>::npp_t npp_t;
static void call(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream)
{
NppStreamHandler h(stream);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step),
dst.ptr<npp_t>(), static_cast<int>(dst.step), sz,
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
}
void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call},
{0,0,0,0},
{NppMirror<CV_16U, nppiMirror_16u_C1R>::call, 0, NppMirror<CV_16U, nppiMirror_16u_C3R>::call, NppMirror<CV_16U, nppiMirror_16u_C4R>::call},
{0,0,0,0},
{NppMirror<CV_32S, nppiMirror_32s_C1R>::call, 0, NppMirror<CV_32S, nppiMirror_32s_C3R>::call, NppMirror<CV_32S, nppiMirror_32s_C4R>::call},
{NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F);
CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
dst.create(src.size(), src.type());
funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
// LUT
void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
{
const int cn = src.channels();
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
CV_Assert( lut.depth() == CV_8U );
CV_Assert( lut.channels() == 1 || lut.channels() == cn );
CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() );
dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn));
NppiSize sz;
sz.height = src.rows;
sz.width = src.cols;
Mat nppLut;
lut.convertTo(nppLut, CV_32S);
int nValues3[] = {256, 256, 256};
Npp32s pLevels[256];
for (int i = 0; i < 256; ++i)
pLevels[i] = i;
const Npp32s* pLevels3[3];
#if (CUDA_VERSION <= 4020)
pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
#else
GpuMat d_pLevels;
d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
#endif
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
if (src.type() == CV_8UC1)
{
#if (CUDA_VERSION <= 4020)
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
#else
GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
#endif
}
else
{
const Npp32s* pValues3[3];
Mat nppLut3[3];
if (nppLut.channels() == 1)
{
#if (CUDA_VERSION <= 4020)
pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
#else
GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
#endif
}
else
{
cv::split(nppLut, nppLut3);
#if (CUDA_VERSION <= 4020)
pValues3[0] = nppLut3[0].ptr<Npp32s>();
pValues3[1] = nppLut3[1].ptr<Npp32s>();
pValues3[2] = nppLut3[2].ptr<Npp32s>();
#else
GpuMat d_nppLut0(Mat(1, 256, CV_32S, nppLut3[0].data));
GpuMat d_nppLut1(Mat(1, 256, CV_32S, nppLut3[1].data));
GpuMat d_nppLut2(Mat(1, 256, CV_32S, nppLut3[2].data));
pValues3[0] = d_nppLut0.ptr<Npp32s>();
pValues3[1] = d_nppLut1.ptr<Npp32s>();
pValues3[2] = d_nppLut2.ptr<Npp32s>();
#endif
}
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// copyMakeBorder
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);
}
}}}
namespace
{
template <typename T, int cn> void copyMakeBorder_caller(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream)
{
using namespace ::cv::gpu::cudev::imgproc;
Scalar_<T> val(saturate_cast<T>(value[0]), saturate_cast<T>(value[1]), saturate_cast<T>(value[2]), saturate_cast<T>(value[3]));
copyMakeBorder_gpu<T, cn>(src, dst, top, left, borderType, val.val, stream);
}
}
#if defined __GNUC__ && __GNUC__ > 2 && __GNUC_MINOR__ > 4
typedef Npp32s __attribute__((__may_alias__)) Npp32s_a;
#else
typedef Npp32s Npp32s_a;
#endif
void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value, Stream& s)
{
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);
dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
cudaStream_t stream = StreamAccessor::getStream(s);
if (borderType == BORDER_CONSTANT && (src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1))
{
NppiSize srcsz;
srcsz.width = src.cols;
srcsz.height = src.rows;
NppiSize dstsz;
dstsz.width = dst.cols;
dstsz.height = dst.rows;
NppStreamHandler h(stream);
switch (src.type())
{
case CV_8UC1:
{
Npp8u nVal = saturate_cast<Npp8u>(value[0]);
nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_8UC4:
{
Npp8u nVal[] = {saturate_cast<Npp8u>(value[0]), saturate_cast<Npp8u>(value[1]), saturate_cast<Npp8u>(value[2]), saturate_cast<Npp8u>(value[3])};
nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_32SC1:
{
Npp32s nVal = saturate_cast<Npp32s>(value[0]);
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_32FC1:
{
Npp32f val = saturate_cast<Npp32f>(value[0]);
Npp32s nVal = *(reinterpret_cast<Npp32s_a*>(&val));
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
else
{
typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
static const caller_t callers[6][4] =
{
{ copyMakeBorder_caller<uchar, 1> , copyMakeBorder_caller<uchar, 2> , copyMakeBorder_caller<uchar, 3> , copyMakeBorder_caller<uchar, 4>},
{0/*copyMakeBorder_caller<schar, 1>*/, 0/*copyMakeBorder_caller<schar, 2>*/ , 0/*copyMakeBorder_caller<schar, 3>*/, 0/*copyMakeBorder_caller<schar, 4>*/},
{ copyMakeBorder_caller<ushort, 1> , 0/*copyMakeBorder_caller<ushort, 2>*/, copyMakeBorder_caller<ushort, 3> , copyMakeBorder_caller<ushort, 4>},
{ copyMakeBorder_caller<short, 1> , 0/*copyMakeBorder_caller<short, 2>*/ , copyMakeBorder_caller<short, 3> , copyMakeBorder_caller<short, 4>},
{0/*copyMakeBorder_caller<int, 1>*/, 0/*copyMakeBorder_caller<int, 2>*/ , 0/*copyMakeBorder_caller<int, 3>*/, 0/*copyMakeBorder_caller<int , 4>*/},
{ copyMakeBorder_caller<float, 1> , 0/*copyMakeBorder_caller<float, 2>*/ , copyMakeBorder_caller<float, 3> , copyMakeBorder_caller<float ,4>}
};
caller_t func = callers[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
func(src, dst, top, left, borderType, value, stream);
}
}
#endif /* !defined (HAVE_CUDA) */

View File

@@ -1,147 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace arithm
{
struct VAbsDiff4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff4(a, b);
}
__device__ __forceinline__ VAbsDiff4() {}
__device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
};
struct VAbsDiff2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff2(a, b);
}
__device__ __forceinline__ VAbsDiff2() {}
__device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
};
__device__ __forceinline__ int _abs(int a)
{
return ::abs(a);
}
__device__ __forceinline__ float _abs(float a)
{
return ::fabsf(a);
}
__device__ __forceinline__ double _abs(double a)
{
return ::fabs(a);
}
template <typename T> struct AbsDiffMat : binary_function<T, T, T>
{
__device__ __forceinline__ T operator ()(T a, T b) const
{
return saturate_cast<T>(_abs(a - b));
}
__device__ __forceinline__ AbsDiffMat() {}
__device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T> struct TransformFunctorTraits< arithm::AbsDiffMat<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
}
void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
}
template <typename T>
void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, AbsDiffMat<T>(), WithOutMask(), stream);
}
template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,98 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace arithm
{
template <typename T, typename S> struct AbsDiffScalar : unary_function<T, T>
{
S val;
explicit AbsDiffScalar(S val_) : val(val_) {}
__device__ __forceinline__ T operator ()(T a) const
{
abs_func<S> f;
return saturate_cast<T>(f(a - val));
}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <typename T, typename S> struct TransformFunctorTraits< arithm::AbsDiffScalar<T, S> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T, typename S>
void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
{
AbsDiffScalar<T, S> op(static_cast<S>(val));
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, op, WithOutMask(), stream);
}
template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,185 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace arithm
{
struct VAdd4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vadd4(a, b);
}
__device__ __forceinline__ VAdd4() {}
__device__ __forceinline__ VAdd4(const VAdd4& other) {}
};
struct VAdd2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vadd2(a, b);
}
__device__ __forceinline__ VAdd2() {}
__device__ __forceinline__ VAdd2(const VAdd2& other) {}
};
template <typename T, typename D> struct AddMat : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(a + b);
}
__device__ __forceinline__ AddMat() {}
__device__ __forceinline__ AddMat(const AddMat& other) {}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T, typename D> struct TransformFunctorTraits< arithm::AddMat<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
}
void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
}
template <typename T, typename D>
void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), mask, stream);
else
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), WithOutMask(), stream);
}
template void addMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,148 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace arithm
{
template <typename T, typename S, typename D> struct AddScalar : unary_function<T, D>
{
S val;
explicit AddScalar(S val_) : val(val_) {}
__device__ __forceinline__ D operator ()(T a) const
{
return saturate_cast<D>(a + val);
}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::AddScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
AddScalar<T, S, D> op(static_cast<S>(val));
if (mask.data)
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
else
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void addScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,364 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace arithm
{
template <typename T> struct UseDouble_
{
enum {value = 0};
};
template <> struct UseDouble_<double>
{
enum {value = 1};
};
template <typename T1, typename T2, typename D> struct UseDouble
{
enum {value = (UseDouble_<T1>::value || UseDouble_<T2>::value || UseDouble_<D>::value)};
};
template <typename T1, typename T2, typename D, bool useDouble> struct AddWeighted_;
template <typename T1, typename T2, typename D> struct AddWeighted_<T1, T2, D, false> : binary_function<T1, T2, D>
{
float alpha;
float beta;
float gamma;
AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(static_cast<float>(alpha_)), beta(static_cast<float>(beta_)), gamma(static_cast<float>(gamma_)) {}
__device__ __forceinline__ D operator ()(T1 a, T2 b) const
{
return saturate_cast<D>(a * alpha + b * beta + gamma);
}
};
template <typename T1, typename T2, typename D> struct AddWeighted_<T1, T2, D, true> : binary_function<T1, T2, D>
{
double alpha;
double beta;
double gamma;
AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}
__device__ __forceinline__ D operator ()(T1 a, T2 b) const
{
return saturate_cast<D>(a * alpha + b * beta + gamma);
}
};
template <typename T1, typename T2, typename D> struct AddWeighted : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value>
{
AddWeighted(double alpha_, double beta_, double gamma_) : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value>(alpha_, beta_, gamma_) {}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <typename T1, typename T2, typename D, size_t src1_size, size_t src2_size, size_t dst_size> struct AddWeightedTraits : DefaultTransformFunctorTraits< arithm::AddWeighted<T1, T2, D> >
{
};
template <typename T1, typename T2, typename D, size_t src_size, size_t dst_size> struct AddWeightedTraits<T1, T2, D, src_size, src_size, dst_size> : arithm::ArithmFuncTraits<src_size, dst_size>
{
};
template <typename T1, typename T2, typename D> struct TransformFunctorTraits< arithm::AddWeighted<T1, T2, D> > : AddWeightedTraits<T1, T2, D, sizeof(T1), sizeof(T2), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T1, typename T2, typename D>
void addWeighted(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream)
{
AddWeighted<T1, T2, D> op(alpha, beta, gamma);
cudev::transform((PtrStepSz<T1>) src1, (PtrStepSz<T2>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void addWeighted<uchar, uchar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */

View File

@@ -1,145 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __ARITHM_FUNC_TRAITS_HPP__
#define __ARITHM_FUNC_TRAITS_HPP__
#include <cstddef>
namespace arithm
{
template <size_t src_size, size_t dst_size> struct ArithmFuncTraits
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 1 };
};
template <> struct ArithmFuncTraits<1, 1>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<1, 2>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<1, 4>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<2, 1>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<2, 2>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<2, 4>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<4, 1>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<4, 2>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<4, 4>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
}
#endif // __ARITHM_FUNC_TRAITS_HPP__

View File

@@ -1,126 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< bit_not<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< bit_and<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< bit_or<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< bit_xor<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), mask, stream);
else
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), WithOutMask(), stream);
}
template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), mask, stream);
else
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), WithOutMask(), stream);
}
template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), mask, stream);
else
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), WithOutMask(), stream);
}
template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), mask, stream);
else
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), WithOutMask(), stream);
}
template void bitMatNot<uchar>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatNot<ushort>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatNot<uint>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatAnd<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatAnd<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatAnd<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatOr<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatOr<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatOr<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatXor<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatXor<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatXor<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,104 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< binder2nd< bit_and<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< binder2nd< bit_or<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< binder2nd< bit_xor<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T> void bitScalarAnd(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
{
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::cudev::bind2nd(bit_and<T>(), src2), WithOutMask(), stream);
}
template <typename T> void bitScalarOr(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
{
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::cudev::bind2nd(bit_or<T>(), src2), WithOutMask(), stream);
}
template <typename T> void bitScalarXor(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
{
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::cudev::bind2nd(bit_xor<T>(), src2), WithOutMask(), stream);
}
template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,206 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace arithm
{
struct VCmpEq4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpeq4(a, b);
}
__device__ __forceinline__ VCmpEq4() {}
__device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
};
struct VCmpNe4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpne4(a, b);
}
__device__ __forceinline__ VCmpNe4() {}
__device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
};
struct VCmpLt4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmplt4(a, b);
}
__device__ __forceinline__ VCmpLt4() {}
__device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
};
struct VCmpLe4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmple4(a, b);
}
__device__ __forceinline__ VCmpLe4() {}
__device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
};
template <class Op, typename T>
struct Cmp : binary_function<T, T, uchar>
{
__device__ __forceinline__ uchar operator()(T a, T b) const
{
Op op;
return -op(a, b);
}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits< arithm::VCmpEq4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpNe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpLt4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpLe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <class Op, typename T> struct TransformFunctorTraits< arithm::Cmp<Op, T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
{
};
}}}
namespace arithm
{
void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
}
void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
}
void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
}
void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
}
template <template <typename> class Op, typename T>
void cmpMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
Cmp<Op<T>, T> op;
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, dst, op, WithOutMask(), stream);
}
template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
cmpMat<equal_to, T>(src1, src2, dst, stream);
}
template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
cmpMat<not_equal_to, T>(src1, src2, dst, stream);
}
template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
cmpMat<less, T>(src1, src2, dst, stream);
}
template <typename T> void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
cmpMat<less_equal, T>(src1, src2, dst, stream);
}
template void cmpMatEq<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,284 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace arithm
{
template <class Op, typename T>
struct Cmp : binary_function<T, T, uchar>
{
__device__ __forceinline__ uchar operator()(T a, T b) const
{
Op op;
return -op(a, b);
}
};
#define TYPE_VEC(type, cn) typename TypeVec<type, cn>::vec_type
template <class Op, typename T, int cn> struct CmpScalar;
template <class Op, typename T>
struct CmpScalar<Op, T, 1> : unary_function<T, uchar>
{
T val;
__host__ explicit CmpScalar(T val_) : val(val_) {}
__device__ __forceinline__ uchar operator()(T src) const
{
Cmp<Op, T> op;
return op(src, val);
}
};
template <class Op, typename T>
struct CmpScalar<Op, T, 2> : unary_function<TYPE_VEC(T, 2), TYPE_VEC(uchar, 2)>
{
TYPE_VEC(T, 2) val;
__host__ explicit CmpScalar(TYPE_VEC(T, 2) val_) : val(val_) {}
__device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const
{
Cmp<Op, T> op;
return VecTraits<TYPE_VEC(uchar, 2)>::make(op(src.x, val.x), op(src.y, val.y));
}
};
template <class Op, typename T>
struct CmpScalar<Op, T, 3> : unary_function<TYPE_VEC(T, 3), TYPE_VEC(uchar, 3)>
{
TYPE_VEC(T, 3) val;
__host__ explicit CmpScalar(TYPE_VEC(T, 3) val_) : val(val_) {}
__device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const
{
Cmp<Op, T> op;
return VecTraits<TYPE_VEC(uchar, 3)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z));
}
};
template <class Op, typename T>
struct CmpScalar<Op, T, 4> : unary_function<TYPE_VEC(T, 4), TYPE_VEC(uchar, 4)>
{
TYPE_VEC(T, 4) val;
__host__ explicit CmpScalar(TYPE_VEC(T, 4) val_) : val(val_) {}
__device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const
{
Cmp<Op, T> op;
return VecTraits<TYPE_VEC(uchar, 4)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z), op(src.w, val.w));
}
};
#undef TYPE_VEC
}
namespace cv { namespace gpu { namespace cudev
{
template <class Op, typename T> struct TransformFunctorTraits< arithm::CmpScalar<Op, T, 1> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
{
};
}}}
namespace arithm
{
template <template <typename> class Op, typename T, int cn>
void cmpScalar(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type src_t;
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
T sval[] = {static_cast<T>(val[0]), static_cast<T>(val[1]), static_cast<T>(val[2]), static_cast<T>(val[3])};
src_t val1 = VecTraits<src_t>::make(sval);
CmpScalar<Op<T>, T, cn> op(val1);
cudev::transform((PtrStepSz<src_t>) src, (PtrStepSz<dst_t>) dst, op, WithOutMask(), stream);
}
template <typename T> void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<equal_to, T, 1>,
cmpScalar<equal_to, T, 2>,
cmpScalar<equal_to, T, 3>,
cmpScalar<equal_to, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<not_equal_to, T, 1>,
cmpScalar<not_equal_to, T, 2>,
cmpScalar<not_equal_to, T, 3>,
cmpScalar<not_equal_to, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<less, T, 1>,
cmpScalar<less, T, 2>,
cmpScalar<less, T, 3>,
cmpScalar<less, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<less_equal, T, 1>,
cmpScalar<less_equal, T, 2>,
cmpScalar<less_equal, T, 3>,
cmpScalar<less_equal, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<greater, T, 1>,
cmpScalar<greater, T, 2>,
cmpScalar<greater, T, 3>,
cmpScalar<greater, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<greater_equal, T, 1>,
cmpScalar<greater_equal, T, 2>,
cmpScalar<greater_equal, T, 3>,
cmpScalar<greater_equal, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template void cmpScalarEq<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,131 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, PtrStepSz<T> dst, int top, int left)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = src(y - top, x - left);
}
template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
{
static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, int top, int left,
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);
copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode,
const T* borderValue, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type vec_type;
typedef void (*caller_t)(const PtrStepSz<vec_type>& src, const PtrStepSz<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
static const caller_t callers[5] =
{
CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
CopyMakeBorderDispatcher<BrdWrap, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call
};
callers[borderMode](PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
}
template void copyMakeBorder_gpu<uchar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<ushort, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<short, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<float, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

View File

@@ -1,175 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace countNonZero
{
__device__ unsigned int blocks_finished = 0;
template <int BLOCK_SIZE, typename T>
__global__ void kernel(const PtrStepSz<T> src, unsigned int* count, const int twidth, const int theight)
{
__shared__ unsigned int scount[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
unsigned int mycount = 0;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
const T srcVal = ptr[x];
mycount += (srcVal != 0);
}
}
cudev::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
#if __CUDA_ARCH__ >= 200
if (tid == 0)
::atomicAdd(count, mycount);
#else
__shared__ bool is_last;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
if (tid == 0)
{
count[bid] = mycount;
__threadfence();
unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
mycount = tid < gridDim.x * gridDim.y ? count[tid] : 0;
cudev::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
if (tid == 0)
{
count[0] = mycount;
blocks_finished = 0;
}
}
#endif
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
}
template <typename T>
int run(const PtrStepSzb src, PtrStep<unsigned int> buf)
{
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
unsigned int* count_buf = buf.ptr(0);
cudaSafeCall( cudaMemset(count_buf, 0, sizeof(unsigned int)) );
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, count_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
unsigned int count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost));
return count;
}
template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<int >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
}
#endif // CUDA_DISABLER

View File

@@ -1,144 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace arithm
{
template <typename T, typename S, typename D> struct DivInv : unary_function<T, D>
{
S val;
explicit DivInv(S val_) : val(val_) {}
__device__ __forceinline__ D operator ()(T a) const
{
return a != 0 ? saturate_cast<D>(val / a) : 0;
}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivInv<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
{
DivInv<T, S, D> op(static_cast<S>(val));
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void divInv<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,230 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace arithm
{
struct Div_8uc4_32f : binary_function<uint, float, uint>
{
__device__ __forceinline__ uint operator ()(uint a, float b) const
{
uint res = 0;
if (b != 0)
{
b = 1.0f / b;
res |= (saturate_cast<uchar>((0xffu & (a )) * b) );
res |= (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8);
res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);
}
return res;
}
};
struct Div_16sc4_32f : binary_function<short4, float, short4>
{
__device__ __forceinline__ short4 operator ()(short4 a, float b) const
{
return b != 0 ? make_short4(saturate_cast<short>(a.x / b), saturate_cast<short>(a.y / b),
saturate_cast<short>(a.z / b), saturate_cast<short>(a.w / b))
: make_short4(0,0,0,0);
}
};
template <typename T, typename D> struct Div : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return b != 0 ? saturate_cast<D>(a / b) : 0;
}
__device__ __forceinline__ Div() {}
__device__ __forceinline__ Div(const Div& other) {}
};
template <typename T> struct Div<T, float> : binary_function<T, T, float>
{
__device__ __forceinline__ float operator ()(T a, T b) const
{
return b != 0 ? static_cast<float>(a) / b : 0;
}
__device__ __forceinline__ Div() {}
__device__ __forceinline__ Div(const Div& other) {}
};
template <typename T> struct Div<T, double> : binary_function<T, T, double>
{
__device__ __forceinline__ double operator ()(T a, T b) const
{
return b != 0 ? static_cast<double>(a) / b : 0;
}
__device__ __forceinline__ Div() {}
__device__ __forceinline__ Div(const Div& other) {}
};
template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D>
{
S scale;
explicit DivScale(S scale_) : scale(scale_) {}
__device__ __forceinline__ D operator ()(T a, T b) const
{
return b != 0 ? saturate_cast<D>(scale * a / b) : 0;
}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<arithm::Div_8uc4_32f> : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T, typename D> struct TransformFunctorTraits< arithm::Div<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScale<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
void divMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream);
}
void divMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream);
}
template <typename T, typename S, typename D>
void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream)
{
if (scale == 1)
{
Div<T, D> op;
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
else
{
DivScale<T, S, D> op(static_cast<S>(scale));
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
}
template void divMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,144 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace arithm
{
template <typename T, typename S, typename D> struct DivScalar : unary_function<T, D>
{
S val;
explicit DivScalar(S val_) : val(val_) {}
__device__ __forceinline__ D operator ()(T a) const
{
return saturate_cast<D>(a / val);
}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
{
DivScalar<T, S, D> op(static_cast<S>(val));
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,472 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
// Utility function to extract unsigned chars from an unsigned integer
__device__ uchar4 int_to_uchar4(unsigned int in)
{
uchar4 bytes;
bytes.x = (in & 0x000000ff) >> 0;
bytes.y = (in & 0x0000ff00) >> 8;
bytes.z = (in & 0x00ff0000) >> 16;
bytes.w = (in & 0xff000000) >> 24;
return bytes;
}
__global__ void shfl_integral_horizontal(const PtrStep<uint4> img, PtrStep<uint4> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ int sums[128];
const int id = threadIdx.x;
const int lane_id = id % warpSize;
const int warp_id = id / warpSize;
const uint4 data = img(blockIdx.x, id);
const uchar4 a = int_to_uchar4(data.x);
const uchar4 b = int_to_uchar4(data.y);
const uchar4 c = int_to_uchar4(data.z);
const uchar4 d = int_to_uchar4(data.w);
int result[16];
result[0] = a.x;
result[1] = result[0] + a.y;
result[2] = result[1] + a.z;
result[3] = result[2] + a.w;
result[4] = result[3] + b.x;
result[5] = result[4] + b.y;
result[6] = result[5] + b.z;
result[7] = result[6] + b.w;
result[8] = result[7] + c.x;
result[9] = result[8] + c.y;
result[10] = result[9] + c.z;
result[11] = result[10] + c.w;
result[12] = result[11] + d.x;
result[13] = result[12] + d.y;
result[14] = result[13] + d.z;
result[15] = result[14] + d.w;
int sum = result[15];
// the prefix sum for each thread's 16 value is computed,
// now the final sums (result[15]) need to be shared
// with the other threads and add. To do this,
// the __shfl_up() instruction is used and a shuffle scan
// operation is performed to distribute the sums to the correct
// threads
#pragma unroll
for (int i = 1; i < 32; i *= 2)
{
const int n = __shfl_up(sum, i, 32);
if (lane_id >= i)
{
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += n;
sum += n;
}
}
// Now the final sum for the warp must be shared
// between warps. This is done by each warp
// having a thread store to shared memory, then
// having some other warp load the values and
// compute a prefix sum, again by using __shfl_up.
// The results are uniformly added back to the warps.
// last thread in the warp holding sum of the warp
// places that in shared
if (threadIdx.x % warpSize == warpSize - 1)
sums[warp_id] = result[15];
__syncthreads();
if (warp_id == 0)
{
int warp_sum = sums[lane_id];
#pragma unroll
for (int i = 1; i <= 32; i *= 2)
{
const int n = __shfl_up(warp_sum, i, 32);
if (lane_id >= i)
warp_sum += n;
}
sums[lane_id] = warp_sum;
}
__syncthreads();
int blockSum = 0;
// fold in unused warp
if (warp_id > 0)
{
blockSum = sums[warp_id - 1];
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += blockSum;
}
// assemble result
// Each thread has 16 values to write, which are
// now integer data (to avoid overflow). Instead of
// each thread writing consecutive uint4s, the
// approach shown here experiments using
// the shuffle command to reformat the data
// inside the registers so that each thread holds
// consecutive data to be written so larger contiguous
// segments can be assembled for writing.
/*
For example data that needs to be written as
GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
but is stored in registers (r0..r3), in four threads (0..3) as:
threadId 0 1 2 3
r0 x0 y0 z0 w0
r1 x1 y1 z1 w1
r2 x2 y2 z2 w2
r3 x3 y3 z3 w3
after apply __shfl_xor operations to move data between registers r1..r3:
threadId 00 01 10 11
x0 y0 z0 w0
xor(01)->y1 x1 w1 z1
xor(10)->z2 w2 x2 y2
xor(11)->w3 z3 y3 x3
and now x0..x3, and z0..z3 can be written out in order by all threads.
In the current code, each register above is actually representing
four integers to be written as uint4's to GMEM.
*/
result[4] = __shfl_xor(result[4] , 1, 32);
result[5] = __shfl_xor(result[5] , 1, 32);
result[6] = __shfl_xor(result[6] , 1, 32);
result[7] = __shfl_xor(result[7] , 1, 32);
result[8] = __shfl_xor(result[8] , 2, 32);
result[9] = __shfl_xor(result[9] , 2, 32);
result[10] = __shfl_xor(result[10], 2, 32);
result[11] = __shfl_xor(result[11], 2, 32);
result[12] = __shfl_xor(result[12], 3, 32);
result[13] = __shfl_xor(result[13], 3, 32);
result[14] = __shfl_xor(result[14], 3, 32);
result[15] = __shfl_xor(result[15], 3, 32);
uint4* integral_row = integral.ptr(blockIdx.x);
uint4 output;
///////
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
///////
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
// continuning from the above example,
// this use of __shfl_xor() places the y0..y3 and w0..w3 data
// in order.
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] = __shfl_xor(result[i], 1, 32);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
///////
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
#endif
}
// This kernel computes columnwise prefix sums. When the data input is
// the row sums from above, this completes the integral image.
// The approach here is to have each block compute a local set of sums.
// First , the data covered by the block is loaded into shared memory,
// then instead of performing a sum in shared memory using __syncthreads
// between stages, the data is reformatted so that the necessary sums
// occur inside warps and the shuffle scan operation is used.
// The final set of sums from the block is then propgated, with the block
// computing "down" the image and adding the running sum to the local
// block sums.
__global__ void shfl_integral_vertical(PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;
if (tidx >= integral.cols)
return;
sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();
unsigned int stepSum = 0;
for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
{
unsigned int* p = integral.ptr(y) + tidx;
unsigned int sum = *p;
sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();
// place into SMEM
// shfl scan reduce the SMEM, reformating so the column
// sums are computed in a warp
// then read out properly
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;
int partial_sum = sums[k][j];
for (int i = 1; i <= 8; i *= 2)
{
int n = __shfl_up(partial_sum, i, 32);
if (lane_id >= i)
partial_sum += n;
}
sums[k][j] = partial_sum;
__syncthreads();
if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];
sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];
__syncthreads();
*p = sum;
}
#endif
}
void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
{
{
// each thread handles 16 values, use 1 block/row
// save, becouse step is actually can't be less 512 bytes
int block = integral.cols / 16;
// launch 1 block / row
const int grid = img.rows;
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
cudaSafeCall( cudaGetLastError() );
}
{
const dim3 block(32, 8);
const dim3 grid(divUp(integral.cols, block.x), 1);
shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void shfl_integral_vertical(PtrStepSz<unsigned int> buffer, PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;
if (tidx >= integral.cols)
return;
sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();
unsigned int stepSum = 0;
for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
{
unsigned int* p = buffer.ptr(y) + tidx;
unsigned int* dst = integral.ptr(y + 1) + tidx + 1;
unsigned int sum = *p;
sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();
// place into SMEM
// shfl scan reduce the SMEM, reformating so the column
// sums are computed in a warp
// then read out properly
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;
int partial_sum = sums[k][j];
for (int i = 1; i <= 8; i *= 2)
{
int n = __shfl_up(partial_sum, i, 32);
if (lane_id >= i)
partial_sum += n;
}
sums[k][j] = partial_sum;
__syncthreads();
if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];
sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];
__syncthreads();
*dst = sum;
}
#endif
}
// used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz<uint4> buffer, PtrStepSz<unsigned int> integral,
int blockStep, cudaStream_t stream)
{
{
const int block = blockStep;
const int grid = img.rows;
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, buffer);
cudaSafeCall( cudaGetLastError() );
}
{
const dim3 block(32, 8);
const dim3 grid(divUp(integral.cols, block.x), 1);
shfl_integral_vertical<<<grid, block, 0, stream>>>((PtrStepSz<uint>)buffer, integral);
cudaSafeCall( cudaGetLastError() );
}
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -1,302 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/type_traits.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
//////////////////////////////////////////////////////////////////////////
// absMat
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< abs_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, abs_func<T>(), WithOutMask(), stream);
}
template void absMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// sqrMat
namespace arithm
{
template <typename T> struct Sqr : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(T x) const
{
return saturate_cast<T>(x * x);
}
__device__ __forceinline__ Sqr() {}
__device__ __forceinline__ Sqr(const Sqr& other) {}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< arithm::Sqr<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Sqr<T>(), WithOutMask(), stream);
}
template void sqrMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// sqrtMat
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< sqrt_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, sqrt_func<T>(), WithOutMask(), stream);
}
template void sqrtMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// logMat
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< log_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, log_func<T>(), WithOutMask(), stream);
}
template void logMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// expMat
namespace arithm
{
template <typename T> struct Exp : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(T x) const
{
exp_func<T> f;
return saturate_cast<T>(f(x));
}
__device__ __forceinline__ Exp() {}
__device__ __forceinline__ Exp(const Exp& other) {}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< arithm::Exp<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Exp<T>(), WithOutMask(), stream);
}
template void expMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// pow
namespace arithm
{
template<typename T, bool Signed = numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>
{
float power;
PowOp(double power_) : power(static_cast<float>(power_)) {}
__device__ __forceinline__ T operator()(T e) const
{
return saturate_cast<T>(__powf((float)e, power));
}
};
template<typename T> struct PowOp<T, true> : unary_function<T, T>
{
float power;
PowOp(double power_) : power(static_cast<float>(power_)) {}
__device__ __forceinline__ T operator()(T e) const
{
T res = saturate_cast<T>(__powf((float)e, power));
if ((e < 0) && (1 & static_cast<int>(power)))
res *= -1;
return res;
}
};
template<> struct PowOp<float> : unary_function<float, float>
{
float power;
PowOp(double power_) : power(static_cast<float>(power_)) {}
__device__ __forceinline__ float operator()(float e) const
{
return __powf(::fabs(e), power);
}
};
template<> struct PowOp<double> : unary_function<double, double>
{
double power;
PowOp(double power_) : power(power_) {}
__device__ __forceinline__ double operator()(double e) const
{
return ::pow(::fabs(e), power);
}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< arithm::PowOp<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template<typename T>
void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream)
{
cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, PowOp<T>(power), WithOutMask(), stream);
}
template void pow<uchar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<schar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<short>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<ushort>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<int>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<float>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<double>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,246 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/utility.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace minMax
{
__device__ unsigned int blocks_finished = 0;
// To avoid shared bank conflicts we convert each value into value of
// appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits;
template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<schar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
template <int BLOCK_SIZE, typename R>
struct GlobalReduce
{
static __device__ void run(R& mymin, R& mymax, R* minval, R* maxval, int tid, int bid, R* sminval, R* smaxval)
{
#if __CUDA_ARCH__ >= 200
if (tid == 0)
{
Emulation::glob::atomicMin(minval, mymin);
Emulation::glob::atomicMax(maxval, mymax);
}
#else
__shared__ bool is_last;
if (tid == 0)
{
minval[bid] = mymin;
maxval[bid] = mymax;
__threadfence();
unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
int idx = ::min(tid, gridDim.x * gridDim.y - 1);
mymin = minval[idx];
mymax = maxval[idx];
const minimum<R> minOp;
const maximum<R> maxOp;
cudev::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
if (tid == 0)
{
minval[0] = mymin;
maxval[0] = mymax;
blocks_finished = 0;
}
}
#endif
}
};
template <int BLOCK_SIZE, typename T, typename R, class Mask>
__global__ void kernel(const PtrStepSz<T> src, const Mask mask, R* minval, R* maxval, const int twidth, const int theight)
{
__shared__ R sminval[BLOCK_SIZE];
__shared__ R smaxval[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
R mymin = numeric_limits<R>::max();
R mymax = -numeric_limits<R>::max();
const minimum<R> minOp;
const maximum<R> maxOp;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
if (mask(y, x))
{
const R srcVal = ptr[x];
mymin = minOp(mymin, srcVal);
mymax = maxOp(mymax, srcVal);
}
}
}
cudev::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
GlobalReduce<BLOCK_SIZE, R>::run(mymin, mymax, minval, maxval, tid, bid, sminval, smaxval);
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
bufcols = grid.x * grid.y * sizeof(double);
bufrows = 2;
}
__global__ void setDefaultKernel(int* minval_buf, int* maxval_buf)
{
*minval_buf = numeric_limits<int>::max();
*maxval_buf = numeric_limits<int>::min();
}
__global__ void setDefaultKernel(float* minval_buf, float* maxval_buf)
{
*minval_buf = numeric_limits<float>::max();
*maxval_buf = -numeric_limits<float>::max();
}
__global__ void setDefaultKernel(double* minval_buf, double* maxval_buf)
{
*minval_buf = numeric_limits<double>::max();
*maxval_buf = -numeric_limits<double>::max();
}
template <typename R>
void setDefault(R* minval_buf, R* maxval_buf)
{
setDefaultKernel<<<1, 1>>>(minval_buf, maxval_buf);
}
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
{
typedef typename MinMaxTypeTraits<T>::best_type R;
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
R* minval_buf = (R*) buf.ptr(0);
R* maxval_buf = (R*) buf.ptr(1);
setDefault(minval_buf, maxval_buf);
if (mask.data)
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, twidth, theight);
else
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
R minval_, maxval_;
cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
template void run<uchar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<schar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<ushort>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
}
#endif // CUDA_DISABLER

View File

@@ -1,228 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
//////////////////////////////////////////////////////////////////////////
// min
namespace arithm
{
struct VMin4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmin4(a, b);
}
__device__ __forceinline__ VMin4() {}
__device__ __forceinline__ VMin4(const VMin4& other) {}
};
struct VMin2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmin2(a, b);
}
__device__ __forceinline__ VMin2() {}
__device__ __forceinline__ VMin2(const VMin2& other) {}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits< arithm::VMin4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VMin2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T> struct TransformFunctorTraits< minimum<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
void minMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, VMin4(), WithOutMask(), stream);
}
void minMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, VMin2(), WithOutMask(), stream);
}
template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, minimum<T>(), WithOutMask(), stream);
}
template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
{
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::cudev::bind2nd(minimum<T>(), src2), WithOutMask(), stream);
}
template void minScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// max
namespace arithm
{
struct VMax4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmax4(a, b);
}
__device__ __forceinline__ VMax4() {}
__device__ __forceinline__ VMax4(const VMax4& other) {}
};
struct VMax2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmax2(a, b);
}
__device__ __forceinline__ VMax2() {}
__device__ __forceinline__ VMax2(const VMax2& other) {}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits< arithm::VMax4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VMax2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T> struct TransformFunctorTraits< maximum<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
void maxMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, VMax4(), WithOutMask(), stream);
}
void maxMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, VMax2(), WithOutMask(), stream);
}
template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, maximum<T>(), WithOutMask(), stream);
}
template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
{
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::cudev::bind2nd(maximum<T>(), src2), WithOutMask(), stream);
}
template void maxScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,235 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/utility.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace minMaxLoc
{
// To avoid shared bank conflicts we convert each value into value of
// appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits;
template <> struct MinMaxTypeTraits<unsigned char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<signed char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<unsigned short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
template <int BLOCK_SIZE, typename T, class Mask>
__global__ void kernel_pass_1(const PtrStepSz<T> src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight)
{
typedef typename MinMaxTypeTraits<T>::best_type work_type;
__shared__ work_type sminval[BLOCK_SIZE];
__shared__ work_type smaxval[BLOCK_SIZE];
__shared__ unsigned int sminloc[BLOCK_SIZE];
__shared__ unsigned int smaxloc[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
work_type mymin = numeric_limits<work_type>::max();
work_type mymax = -numeric_limits<work_type>::max();
unsigned int myminloc = 0;
unsigned int mymaxloc = 0;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
if (mask(y, x))
{
const work_type srcVal = ptr[x];
if (srcVal < mymin)
{
mymin = srcVal;
myminloc = y * src.cols + x;
}
if (srcVal > mymax)
{
mymax = srcVal;
mymaxloc = y * src.cols + x;
}
}
}
}
reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
tid,
thrust::make_tuple(less<work_type>(), greater<work_type>()));
if (tid == 0)
{
minval[bid] = (T) mymin;
maxval[bid] = (T) mymax;
minloc[bid] = myminloc;
maxloc[bid] = mymaxloc;
}
}
template <int BLOCK_SIZE, typename T>
__global__ void kernel_pass_2(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int count)
{
typedef typename MinMaxTypeTraits<T>::best_type work_type;
__shared__ work_type sminval[BLOCK_SIZE];
__shared__ work_type smaxval[BLOCK_SIZE];
__shared__ unsigned int sminloc[BLOCK_SIZE];
__shared__ unsigned int smaxloc[BLOCK_SIZE];
unsigned int idx = ::min(threadIdx.x, count - 1);
work_type mymin = minval[idx];
work_type mymax = maxval[idx];
unsigned int myminloc = minloc[idx];
unsigned int mymaxloc = maxloc[idx];
reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
threadIdx.x,
thrust::make_tuple(less<work_type>(), greater<work_type>()));
if (threadIdx.x == 0)
{
minval[0] = (T) mymin;
maxval[0] = (T) mymax;
minloc[0] = myminloc;
maxloc[0] = mymaxloc;
}
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
// For values
b1cols = (int)(grid.x * grid.y * elem_size);
b1rows = 2;
// For locations
b2cols = grid.x * grid.y * sizeof(int);
b2rows = 2;
}
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf)
{
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
T* minval_buf = (T*) valbuf.ptr(0);
T* maxval_buf = (T*) valbuf.ptr(1);
unsigned int* minloc_buf = locbuf.ptr(0);
unsigned int* maxloc_buf = locbuf.ptr(1);
if (mask.data)
kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
else
kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );
kernel_pass_2<threads_x * threads_y><<<1, threads_x * threads_y>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
unsigned int minloc_, maxloc_;
cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void run<unsigned char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<signed char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<unsigned short>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
}
#endif // CUDA_DISABLER

View File

@@ -1,211 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace arithm
{
struct Mul_8uc4_32f : binary_function<uint, float, uint>
{
__device__ __forceinline__ uint operator ()(uint a, float b) const
{
uint res = 0;
res |= (saturate_cast<uchar>((0xffu & (a )) * b) );
res |= (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8);
res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);
return res;
}
__device__ __forceinline__ Mul_8uc4_32f() {}
__device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {}
};
struct Mul_16sc4_32f : binary_function<short4, float, short4>
{
__device__ __forceinline__ short4 operator ()(short4 a, float b) const
{
return make_short4(saturate_cast<short>(a.x * b), saturate_cast<short>(a.y * b),
saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));
}
__device__ __forceinline__ Mul_16sc4_32f() {}
__device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {}
};
template <typename T, typename D> struct Mul : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(a * b);
}
__device__ __forceinline__ Mul() {}
__device__ __forceinline__ Mul(const Mul& other) {}
};
template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D>
{
S scale;
explicit MulScale(S scale_) : scale(scale_) {}
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(scale * a * b);
}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<arithm::Mul_8uc4_32f> : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T, typename D> struct TransformFunctorTraits< arithm::Mul<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::MulScale<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
void mulMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream);
}
void mulMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream);
}
template <typename T, typename S, typename D>
void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream)
{
if (scale == 1)
{
Mul<T, D> op;
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
else
{
MulScale<T, S, D> op(static_cast<S>(scale));
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
}
template void mulMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,144 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace arithm
{
template <typename T, typename S, typename D> struct MulScalar : unary_function<T, D>
{
S val;
explicit MulScalar(S val_) : val(val_) {}
__device__ __forceinline__ D operator ()(T a) const
{
return saturate_cast<D>(a * val);
}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::MulScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
{
MulScalar<T, S, D> op(static_cast<S>(val));
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void mulScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,171 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "cvconfig.h"
#ifdef HAVE_CUFFT
#include <cufft.h>
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace cudev
{
//////////////////////////////////////////////////////////////////////////
// mulSpectrums
__global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < c.cols && y < c.rows)
{
c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
}
}
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulSpectrums_CONJ
__global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < c.cols && y < c.rows)
{
c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
}
}
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums
__global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < c.cols && y < c.rows)
{
cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
}
}
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
if (stream)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums_CONJ
__global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < c.cols && y < c.rows)
{
cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
}
}
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}}} // namespace cv { namespace gpu { namespace cudev
#endif // HAVE_CUFFT
#endif /* CUDA_DISABLER */

View File

@@ -1,217 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace mathfunc
{
//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar
struct Nothing
{
static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
{
}
};
struct Magnitude
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
}
};
struct MagnitudeSqr
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
}
};
struct Atan2
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
{
float angle = ::atan2f(y_data, x_data);
angle += (angle < 0) * 2.0f * CV_PI_F;
dst[y * dst_step + x] = scale * angle;
}
};
template <typename Mag, typename Angle>
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float x_data = xptr[y * x_step + x];
float y_data = yptr[y * y_step + x];
Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
}
}
struct NonEmptyMag
{
static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
{
return mag[y * mag_step + x];
}
};
struct EmptyMag
{
static __device__ __forceinline__ float get(const float*, size_t, int, int)
{
return 1.0f;
}
};
template <typename Mag>
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float mag_data = Mag::get(mag, mag_step, x, y);
float angle_data = angle[y * angle_step + x];
float sin_a, cos_a;
::sincosf(scale * angle_data, &sin_a, &cos_a);
xptr[y * x_step + x] = mag_data * cos_a;
yptr[y * y_step + x] = mag_data * sin_a;
}
}
template <typename Mag, typename Angle>
void cartToPolar_caller(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(x.cols, threads.x);
grid.y = divUp(x.rows, threads.y);
const float scale = angleInDegrees ? (180.0f / CV_PI_F) : 1.f;
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2][2][2] =
{
{
{
cartToPolar_caller<Magnitude, Atan2>,
cartToPolar_caller<Magnitude, Nothing>
},
{
cartToPolar_caller<MagnitudeSqr, Atan2>,
cartToPolar_caller<MagnitudeSqr, Nothing>,
}
},
{
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>
},
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>,
}
}
};
callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
}
template <typename Mag>
void polarToCart_caller(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(mag.cols, threads.x);
grid.y = divUp(mag.rows, threads.y);
const float scale = angleInDegrees ? (CV_PI_F / 180.0f) : 1.0f;
polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2] =
{
polarToCart_caller<NonEmptyMag>,
polarToCart_caller<EmptyMag>
};
callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
}
} // namespace mathfunc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

View File

@@ -1,330 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "unroll_detail.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace reduce
{
struct Sum
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(0);
}
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
return a + b;
}
template <typename T>
__device__ __forceinline__ T result(T r, double) const
{
return r;
}
__device__ __forceinline__ Sum() {}
__device__ __forceinline__ Sum(const Sum&) {}
};
struct Avg
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(0);
}
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
return a + b;
}
template <typename T>
__device__ __forceinline__ typename TypeVec<double, VecTraits<T>::cn>::vec_type result(T r, double sz) const
{
return r / sz;
}
__device__ __forceinline__ Avg() {}
__device__ __forceinline__ Avg(const Avg&) {}
};
struct Min
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
}
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
minimum<T> minOp;
return minOp(a, b);
}
template <typename T>
__device__ __forceinline__ T result(T r, double) const
{
return r;
}
__device__ __forceinline__ Min() {}
__device__ __forceinline__ Min(const Min&) {}
};
struct Max
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
}
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
maximum<T> maxOp;
return maxOp(a, b);
}
template <typename T>
__device__ __forceinline__ T result(T r, double) const
{
return r;
}
__device__ __forceinline__ Max() {}
__device__ __forceinline__ Max(const Max&) {}
};
///////////////////////////////////////////////////////////
template <typename T, typename S, typename D, class Op>
__global__ void rowsKernel(const PtrStepSz<T> src, D* dst, const Op op)
{
__shared__ S smem[16 * 16];
const int x = blockIdx.x * 16 + threadIdx.x;
S myVal = op.template startValue<S>();
if (x < src.cols)
{
for (int y = threadIdx.y; y < src.rows; y += 16)
{
S srcVal = src(y, x);
myVal = op(myVal, srcVal);
}
}
smem[threadIdx.x * 16 + threadIdx.y] = myVal;
__syncthreads();
volatile S* srow = smem + threadIdx.y * 16;
myVal = srow[threadIdx.x];
cudev::reduce<16>(srow, myVal, threadIdx.x, op);
if (threadIdx.x == 0)
srow[0] = myVal;
__syncthreads();
if (threadIdx.y == 0 && x < src.cols)
dst[x] = (D) op.result(smem[threadIdx.x * 16], src.rows);
}
template <typename T, typename S, typename D, class Op>
void rowsCaller(PtrStepSz<T> src, D* dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(src.cols, block.x));
Op op;
rowsKernel<T, S, D, Op><<<grid, block, 0, stream>>>(src, dst, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename S, typename D>
void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSz<T> src, D* dst, cudaStream_t stream);
static const func_t funcs[] =
{
rowsCaller<T, S, D, Sum>,
rowsCaller<T, S, D, Avg>,
rowsCaller<T, S, D, Max>,
rowsCaller<T, S, D, Min>
};
funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
}
template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, int, short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<int, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<int, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<int, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
///////////////////////////////////////////////////////////
template <int BLOCK_SIZE, typename T, typename S, typename D, int cn, class Op>
__global__ void colsKernel(const PtrStepSz<typename TypeVec<T, cn>::vec_type> src, typename TypeVec<D, cn>::vec_type* dst, const Op op)
{
typedef typename TypeVec<T, cn>::vec_type src_type;
typedef typename TypeVec<S, cn>::vec_type work_type;
typedef typename TypeVec<D, cn>::vec_type dst_type;
__shared__ S smem[BLOCK_SIZE * cn];
const int y = blockIdx.x;
const src_type* srcRow = src.ptr(y);
work_type myVal = op.template startValue<work_type>();
for (int x = threadIdx.x; x < src.cols; x += BLOCK_SIZE)
myVal = op(myVal, saturate_cast<work_type>(srcRow[x]));
cudev::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(myVal), threadIdx.x, detail::Unroll<cn>::op(op));
if (threadIdx.x == 0)
dst[y] = saturate_cast<dst_type>(op.result(myVal, src.cols));
}
template <typename T, typename S, typename D, int cn, class Op> void colsCaller(PtrStepSzb src, void* dst, cudaStream_t stream)
{
const int BLOCK_SIZE = 256;
const dim3 block(BLOCK_SIZE);
const dim3 grid(src.rows);
Op op;
colsKernel<BLOCK_SIZE, T, S, D, cn, Op><<<grid, block, 0, stream>>>((PtrStepSz<typename TypeVec<T, cn>::vec_type>) src, (typename TypeVec<D, cn>::vec_type*) dst, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename S, typename D> void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, void* dst, cudaStream_t stream);
static const func_t funcs[5][4] =
{
{0,0,0,0},
{colsCaller<T, S, D, 1, Sum>, colsCaller<T, S, D, 1, Avg>, colsCaller<T, S, D, 1, Max>, colsCaller<T, S, D, 1, Min>},
{colsCaller<T, S, D, 2, Sum>, colsCaller<T, S, D, 2, Avg>, colsCaller<T, S, D, 2, Max>, colsCaller<T, S, D, 2, Min>},
{colsCaller<T, S, D, 3, Sum>, colsCaller<T, S, D, 3, Avg>, colsCaller<T, S, D, 3, Max>, colsCaller<T, S, D, 3, Min>},
{colsCaller<T, S, D, 4, Sum>, colsCaller<T, S, D, 4, Avg>, colsCaller<T, S, D, 4, Max>, colsCaller<T, S, D, 4, Min>},
};
funcs[cn][op](src, dst, stream);
}
template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, int, short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<int, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<int, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<int, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */

View File

@@ -1,511 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace split_merge
{
template <typename T, size_t elem_size = sizeof(T)>
struct TypeTraits
{
typedef T type;
typedef T type2;
typedef T type3;
typedef T type4;
};
template <typename T>
struct TypeTraits<T, 1>
{
typedef char type;
typedef char2 type2;
typedef char3 type3;
typedef char4 type4;
};
template <typename T>
struct TypeTraits<T, 2>
{
typedef short type;
typedef short2 type2;
typedef short3 type3;
typedef short4 type4;
};
template <typename T>
struct TypeTraits<T, 4>
{
typedef int type;
typedef int2 type2;
typedef int3 type3;
typedef int4 type4;
};
template <typename T>
struct TypeTraits<T, 8>
{
typedef double type;
typedef double2 type2;
//typedef double3 type3;
//typedef double4 type3;
};
typedef void (*MergeFunction)(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream);
typedef void (*SplitFunction)(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream);
//------------------------------------------------------------
// Merge
template <typename T>
__global__ void mergeC2_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type2 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_y[x] = dst_elem;
}
}
template <typename T>
__global__ void mergeC3_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type3 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
const T* src2_y = (const T*)(src2 + y * src2_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_elem.z = src2_y[x];
dst_y[x] = dst_elem;
}
}
template <>
__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src0_y = (const double*)(src0 + y * src0_step);
const double* src1_y = (const double*)(src1 + y * src1_step);
const double* src2_y = (const double*)(src2 + y * src2_step);
double* dst_y = (double*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_y[3 * x] = src0_y[x];
dst_y[3 * x + 1] = src1_y[x];
dst_y[3 * x + 2] = src2_y[x];
}
}
template <typename T>
__global__ void mergeC4_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type4 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
const T* src2_y = (const T*)(src2 + y * src2_step);
const T* src3_y = (const T*)(src3 + y * src3_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_elem.z = src2_y[x];
dst_elem.w = src3_y[x];
dst_y[x] = dst_elem;
}
}
template <>
__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src0_y = (const double*)(src0 + y * src0_step);
const double* src1_y = (const double*)(src1 + y * src1_step);
const double* src2_y = (const double*)(src2 + y * src2_step);
const double* src3_y = (const double*)(src3 + y * src3_step);
double2* dst_y = (double2*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
}
}
template <typename T>
static void mergeC2_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC2_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void mergeC3_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC3_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void mergeC4_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC4_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
src[3].data, src[3].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst,
int total_channels, size_t elem_size,
const cudaStream_t& stream)
{
static MergeFunction merge_func_tbl[] =
{
mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
};
size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
MergeFunction merge_func = merge_func_tbl[merge_func_id];
if (merge_func == 0)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
merge_func(src, dst, stream);
}
//------------------------------------------------------------
// Split
template <typename T>
__global__ void splitC2_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step)
{
typedef typename TypeTraits<T>::type2 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
}
}
template <typename T>
__global__ void splitC3_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step)
{
typedef typename TypeTraits<T>::type3 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
T* dst2_y = (T*)(dst2 + y * dst2_step);
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
dst2_y[x] = src_elem.z;
}
}
template <>
__global__ void splitC3_<double>(
const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src_y = (const double*)(src + y * src_step);
double* dst0_y = (double*)(dst0 + y * dst0_step);
double* dst1_y = (double*)(dst1 + y * dst1_step);
double* dst2_y = (double*)(dst2 + y * dst2_step);
if (x < cols && y < rows)
{
dst0_y[x] = src_y[3 * x];
dst1_y[x] = src_y[3 * x + 1];
dst2_y[x] = src_y[3 * x + 2];
}
}
template <typename T>
__global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step)
{
typedef typename TypeTraits<T>::type4 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
T* dst2_y = (T*)(dst2 + y * dst2_step);
T* dst3_y = (T*)(dst3 + y * dst3_step);
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
dst2_y[x] = src_elem.z;
dst3_y[x] = src_elem.w;
}
}
template <>
__global__ void splitC4_<double>(
const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double2* src_y = (const double2*)(src + y * src_step);
double* dst0_y = (double*)(dst0 + y * dst0_step);
double* dst1_y = (double*)(dst1 + y * dst1_step);
double* dst2_y = (double*)(dst2 + y * dst2_step);
double* dst3_y = (double*)(dst3 + y * dst3_step);
if (x < cols && y < rows)
{
double2 src_elem1 = src_y[2 * x];
double2 src_elem2 = src_y[2 * x + 1];
dst0_y[x] = src_elem1.x;
dst1_y[x] = src_elem1.y;
dst2_y[x] = src_elem2.x;
dst3_y[x] = src_elem2.y;
}
}
template <typename T>
static void splitC2_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC2_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void splitC3_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC3_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void splitC4_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC4_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step,
dst[3].data, dst[3].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
{
static SplitFunction split_func_tbl[] =
{
splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
};
size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
SplitFunction split_func = split_func_tbl[split_func_id];
if (split_func == 0)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
split_func(src, dst, stream);
}
} // namespace split_merge
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

View File

@@ -1,185 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace arithm
{
struct VSub4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vsub4(a, b);
}
__device__ __forceinline__ VSub4() {}
__device__ __forceinline__ VSub4(const VSub4& other) {}
};
struct VSub2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vsub2(a, b);
}
__device__ __forceinline__ VSub2() {}
__device__ __forceinline__ VSub2(const VSub2& other) {}
};
template <typename T, typename D> struct SubMat : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(a - b);
}
__device__ __forceinline__ SubMat() {}
__device__ __forceinline__ SubMat(const SubMat& other) {}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T, typename D> struct TransformFunctorTraits< arithm::SubMat<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
void subMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, VSub4(), WithOutMask(), stream);
}
void subMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
cudev::transform(src1, src2, dst, VSub2(), WithOutMask(), stream);
}
template <typename T, typename D>
void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), mask, stream);
else
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), WithOutMask(), stream);
}
template void subMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,148 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace arithm
{
template <typename T, typename S, typename D> struct SubScalar : unary_function<T, D>
{
S val;
explicit SubScalar(S val_) : val(val_) {}
__device__ __forceinline__ D operator ()(T a) const
{
return saturate_cast<D>(a - val);
}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::SubScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
SubScalar<T, S, D> op(static_cast<S>(val));
if (mask.data)
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
else
cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,380 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "unroll_detail.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace sum
{
__device__ unsigned int blocks_finished = 0;
template <typename R, int cn> struct AtomicAdd;
template <typename R> struct AtomicAdd<R, 1>
{
static __device__ void run(R* ptr, R val)
{
Emulation::glob::atomicAdd(ptr, val);
}
};
template <typename R> struct AtomicAdd<R, 2>
{
typedef typename TypeVec<R, 2>::vec_type val_type;
static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
}
};
template <typename R> struct AtomicAdd<R, 3>
{
typedef typename TypeVec<R, 3>::vec_type val_type;
static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
Emulation::glob::atomicAdd(ptr + 2, val.z);
}
};
template <typename R> struct AtomicAdd<R, 4>
{
typedef typename TypeVec<R, 4>::vec_type val_type;
static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
Emulation::glob::atomicAdd(ptr + 2, val.z);
Emulation::glob::atomicAdd(ptr + 3, val.w);
}
};
template <int BLOCK_SIZE, typename R, int cn>
struct GlobalReduce
{
typedef typename TypeVec<R, cn>::vec_type result_type;
static __device__ void run(result_type& sum, result_type* result, int tid, int bid, R* smem)
{
#if __CUDA_ARCH__ >= 200
if (tid == 0)
AtomicAdd<R, cn>::run((R*) result, sum);
#else
__shared__ bool is_last;
if (tid == 0)
{
result[bid] = sum;
__threadfence();
unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);
cudev::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
if (tid == 0)
{
result[0] = sum;
blocks_finished = 0;
}
}
#endif
}
};
template <int BLOCK_SIZE, typename src_type, typename result_type, class Mask, class Op>
__global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Mask mask, const Op op, const int twidth, const int theight)
{
typedef typename VecTraits<src_type>::elem_type T;
typedef typename VecTraits<result_type>::elem_type R;
const int cn = VecTraits<src_type>::cn;
__shared__ R smem[BLOCK_SIZE * cn];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
result_type sum = VecTraits<result_type>::all(0);
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const src_type* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
if (mask(y, x))
{
const src_type srcVal = ptr[x];
sum = sum + op(saturate_cast<result_type>(srcVal));
}
}
}
cudev::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
GlobalReduce<BLOCK_SIZE, R, cn>::run(sum, result, tid, bid, smem);
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
bufcols = grid.x * grid.y * sizeof(double) * cn;
bufrows = 1;
}
template <typename T, typename R, int cn, template <typename> class Op>
void caller(PtrStepSzb src_, void* buf_, double* out, PtrStepSzb mask)
{
typedef typename TypeVec<T, cn>::vec_type src_type;
typedef typename TypeVec<R, cn>::vec_type result_type;
PtrStepSz<src_type> src(src_);
result_type* buf = (result_type*) buf_;
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
Op<result_type> op;
if (mask.data)
kernel<threads_x * threads_y><<<grid, block>>>(src, buf, SingleMask(mask), op, twidth, theight);
else
kernel<threads_x * threads_y><<<grid, block>>>(src, buf, WithOutMask(), op, twidth, theight);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall( cudaMemcpy(&result, buf, sizeof(result_type), cudaMemcpyDeviceToHost) );
out[0] = result[0];
out[1] = result[1];
out[2] = result[2];
out[3] = result[3];
}
template <typename T> struct SumType;
template <> struct SumType<uchar> { typedef unsigned int R; };
template <> struct SumType<schar> { typedef int R; };
template <> struct SumType<ushort> { typedef unsigned int R; };
template <> struct SumType<short> { typedef int R; };
template <> struct SumType<int> { typedef int R; };
template <> struct SumType<float> { typedef float R; };
template <> struct SumType<double> { typedef double R; };
template <typename T, int cn>
void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
{
typedef typename SumType<T>::R R;
caller<T, R, cn, identity>(src, buf, out, mask);
}
template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template <typename T, int cn>
void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
{
typedef typename SumType<T>::R R;
caller<T, R, cn, abs_func>(src, buf, out, mask);
}
template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template <typename T> struct Sqr : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(T x) const
{
return x * x;
}
};
template <typename T, int cn>
void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
{
caller<T, double, cn, Sqr>(src, buf, out, mask);
}
template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
}
#endif // CUDA_DISABLER

View File

@@ -1,114 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< thresh_binary_inv_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< thresh_trunc_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< thresh_to_zero_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <template <typename> class Op, typename T>
void threshold_caller(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream)
{
Op<T> op(thresh, maxVal);
cudev::transform(src, dst, op, WithOutMask(), stream);
}
template <typename T>
void threshold(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream);
static const caller_t callers[] =
{
threshold_caller<thresh_binary_func, T>,
threshold_caller<thresh_binary_inv_func, T>,
threshold_caller<thresh_trunc_func, T>,
threshold_caller<thresh_to_zero_func, T>,
threshold_caller<thresh_to_zero_inv_func, T>
};
callers[type]((PtrStepSz<T>) src, (PtrStepSz<T>) dst, static_cast<T>(thresh), static_cast<T>(maxVal), stream);
}
template void threshold<uchar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<schar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<ushort>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<short>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<int>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<float>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<double>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,122 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace arithm
{
const int TRANSPOSE_TILE_DIM = 16;
const int TRANSPOSE_BLOCK_ROWS = 16;
template <typename T>
__global__ void transposeKernel(const PtrStepSz<T> src, PtrStep<T> dst)
{
__shared__ T tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1];
int blockIdx_x, blockIdx_y;
// do diagonal reordering
if (gridDim.x == gridDim.y)
{
blockIdx_y = blockIdx.x;
blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
}
else
{
int bid = blockIdx.x + gridDim.x * blockIdx.y;
blockIdx_y = bid % gridDim.y;
blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
}
int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
if (xIndex < src.cols)
{
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
{
if (yIndex + i < src.rows)
{
tile[threadIdx.y + i][threadIdx.x] = src(yIndex + i, xIndex);
}
}
}
__syncthreads();
xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
if (xIndex < src.rows)
{
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
{
if (yIndex + i < src.cols)
{
dst(yIndex + i, xIndex) = tile[threadIdx.x][threadIdx.y + i];
}
}
}
}
template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
transposeKernel<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void transpose<int>(PtrStepSz<int> src, PtrStepSz<int> dst, cudaStream_t stream);
template void transpose<double>(PtrStepSz<double> src, PtrStepSz<double> dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -1,135 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __UNROLL_DETAIL_HPP__
#define __UNROLL_DETAIL_HPP__
#include <thrust/tuple.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
namespace detail
{
template <int cn> struct Unroll;
template <> struct Unroll<1>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ volatile R* smem_tuple(R* smem)
{
return smem;
}
template <typename R>
static __device__ __forceinline__ R& tie(R& val)
{
return val;
}
template <class Op>
static __device__ __forceinline__ const Op& op(const Op& op)
{
return op;
}
};
template <> struct Unroll<2>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE);
}
template <typename R>
static __device__ __forceinline__ thrust::tuple<typename cv::gpu::cudev::VecTraits<R>::elem_type&, typename cv::gpu::cudev::VecTraits<R>::elem_type&> tie(R& val)
{
return thrust::tie(val.x, val.y);
}
template <class Op>
static __device__ __forceinline__ const thrust::tuple<Op, Op> op(const Op& op)
{
return thrust::make_tuple(op, op);
}
};
template <> struct Unroll<3>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
}
template <typename R>
static __device__ __forceinline__ thrust::tuple<typename cv::gpu::cudev::VecTraits<R>::elem_type&, typename cv::gpu::cudev::VecTraits<R>::elem_type&, typename cv::gpu::cudev::VecTraits<R>::elem_type&> tie(R& val)
{
return thrust::tie(val.x, val.y, val.z);
}
template <class Op>
static __device__ __forceinline__ const thrust::tuple<Op, Op, Op> op(const Op& op)
{
return thrust::make_tuple(op, op, op);
}
};
template <> struct Unroll<4>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
}
template <typename R>
static __device__ __forceinline__ thrust::tuple<typename cv::gpu::cudev::VecTraits<R>::elem_type&, typename cv::gpu::cudev::VecTraits<R>::elem_type&, typename cv::gpu::cudev::VecTraits<R>::elem_type&, typename cv::gpu::cudev::VecTraits<R>::elem_type&> tie(R& val)
{
return thrust::tie(val.x, val.y, val.z, val.w);
}
template <class Op>
static __device__ __forceinline__ const thrust::tuple<Op, Op, Op, Op> op(const Op& op)
{
return thrust::make_tuple(op, op, op, op);
}
};
}
#endif // __UNROLL_DETAIL_HPP__

File diff suppressed because it is too large Load Diff

View File

@@ -1,43 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"

View File

@@ -1,70 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_PRECOMP_H__
#define __OPENCV_PRECOMP_H__
#include <limits>
#include "cvconfig.h"
#include "opencv2/gpuarithm.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/gpu_private.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_GPULEGACY
# include "opencv2/gpulegacy.hpp"
# include "opencv2/gpulegacy/private.hpp"
#endif
#ifdef HAVE_CUBLAS
# include <cublas.h>
#endif
#ifdef HAVE_CUFFT
# include <cufft.h>
#endif
#endif /* __OPENCV_PRECOMP_H__ */

View File

@@ -1,784 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
double cv::gpu::norm(const GpuMat&, int) { throw_no_cuda(); return 0.0; }
double cv::gpu::norm(const GpuMat&, int, GpuMat&) { throw_no_cuda(); return 0.0; }
double cv::gpu::norm(const GpuMat&, int, const GpuMat&, GpuMat&) { throw_no_cuda(); return 0.0; }
double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_no_cuda(); return 0.0; }
Scalar cv::gpu::sum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::sum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::absSum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::absSum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::absSum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_no_cuda(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_no_cuda(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
int cv::gpu::countNonZero(const GpuMat&) { throw_no_cuda(); return 0; }
int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_no_cuda(); return 0; }
void cv::gpu::reduce(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_no_cuda(); }
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&, GpuMat&) { throw_no_cuda(); }
void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&, Stream&) { throw_no_cuda(); }
void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&) { throw_no_cuda(); }
void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
#else
namespace
{
class DeviceBuffer
{
public:
explicit DeviceBuffer(int count_ = 1) : count(count_)
{
cudaSafeCall( cudaMalloc(&pdev, count * sizeof(double)) );
}
~DeviceBuffer()
{
cudaSafeCall( cudaFree(pdev) );
}
operator double*() {return pdev;}
void download(double* hptr)
{
double hbuf;
cudaSafeCall( cudaMemcpy(&hbuf, pdev, sizeof(double), cudaMemcpyDeviceToHost) );
*hptr = hbuf;
}
void download(double** hptrs)
{
AutoBuffer<double, 2 * sizeof(double)> hbuf(count);
cudaSafeCall( cudaMemcpy((void*)hbuf, pdev, count * sizeof(double), cudaMemcpyDeviceToHost) );
for (int i = 0; i < count; ++i)
*hptrs[i] = hbuf[i];
}
private:
double* pdev;
int count;
};
}
////////////////////////////////////////////////////////////////////////
// norm
double cv::gpu::norm(const GpuMat& src, int normType)
{
GpuMat buf;
return gpu::norm(src, normType, GpuMat(), buf);
}
double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf)
{
return gpu::norm(src, normType, GpuMat(), buf);
}
double cv::gpu::norm(const GpuMat& src, int normType, const GpuMat& mask, GpuMat& buf)
{
CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size() && src.channels() == 1));
GpuMat src_single_channel = src.reshape(1);
if (normType == NORM_L1)
return gpu::absSum(src_single_channel, mask, buf)[0];
if (normType == NORM_L2)
return std::sqrt(gpu::sqrSum(src_single_channel, mask, buf)[0]);
// NORM_INF
double min_val, max_val;
gpu::minMax(src_single_channel, &min_val, &max_val, mask, buf);
return std::max(std::abs(min_val), std::abs(max_val));
}
double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
{
CV_Assert(src1.type() == CV_8UC1);
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
NppiSize oSizeROI, Npp64f* pRetVal);
static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
NppiSize sz;
sz.width = src1.cols;
sz.height = src1.rows;
int funcIdx = normType >> 1;
double retVal;
DeviceBuffer dbuf;
nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
cudaSafeCall( cudaDeviceSynchronize() );
dbuf.download(&retVal);
return retVal;
}
////////////////////////////////////////////////////////////////////////
// Sum
namespace sum
{
void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows);
template <typename T, int cn>
void run(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
template <typename T, int cn>
void runAbs(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
template <typename T, int cn>
void runSqr(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
}
Scalar cv::gpu::sum(const GpuMat& src)
{
GpuMat buf;
return gpu::sum(src, GpuMat(), buf);
}
Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
{
return gpu::sum(src, GpuMat(), buf);
}
Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
{
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
static const func_t funcs[7][5] =
{
{0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
{0, ::sum::run<schar , 1>, ::sum::run<schar , 2>, ::sum::run<schar , 3>, ::sum::run<schar , 4>},
{0, ::sum::run<ushort, 1>, ::sum::run<ushort, 2>, ::sum::run<ushort, 3>, ::sum::run<ushort, 4>},
{0, ::sum::run<short , 1>, ::sum::run<short , 2>, ::sum::run<short , 3>, ::sum::run<short , 4>},
{0, ::sum::run<int , 1>, ::sum::run<int , 2>, ::sum::run<int , 3>, ::sum::run<int , 4>},
{0, ::sum::run<float , 1>, ::sum::run<float , 2>, ::sum::run<float , 3>, ::sum::run<float , 4>},
{0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
};
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
buf.setTo(Scalar::all(0));
const func_t func = funcs[src.depth()][src.channels()];
double result[4];
func(src, buf.data, result, mask);
return Scalar(result[0], result[1], result[2], result[3]);
}
Scalar cv::gpu::absSum(const GpuMat& src)
{
GpuMat buf;
return gpu::absSum(src, GpuMat(), buf);
}
Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
{
return gpu::absSum(src, GpuMat(), buf);
}
Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
{
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
static const func_t funcs[7][5] =
{
{0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
{0, ::sum::runAbs<schar , 1>, ::sum::runAbs<schar , 2>, ::sum::runAbs<schar , 3>, ::sum::runAbs<schar , 4>},
{0, ::sum::runAbs<ushort, 1>, ::sum::runAbs<ushort, 2>, ::sum::runAbs<ushort, 3>, ::sum::runAbs<ushort, 4>},
{0, ::sum::runAbs<short , 1>, ::sum::runAbs<short , 2>, ::sum::runAbs<short , 3>, ::sum::runAbs<short , 4>},
{0, ::sum::runAbs<int , 1>, ::sum::runAbs<int , 2>, ::sum::runAbs<int , 3>, ::sum::runAbs<int , 4>},
{0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
{0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
};
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
buf.setTo(Scalar::all(0));
const func_t func = funcs[src.depth()][src.channels()];
double result[4];
func(src, buf.data, result, mask);
return Scalar(result[0], result[1], result[2], result[3]);
}
Scalar cv::gpu::sqrSum(const GpuMat& src)
{
GpuMat buf;
return gpu::sqrSum(src, GpuMat(), buf);
}
Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{
return gpu::sqrSum(src, GpuMat(), buf);
}
Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
{
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
static const func_t funcs[7][5] =
{
{0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
{0, ::sum::runSqr<schar , 1>, ::sum::runSqr<schar , 2>, ::sum::runSqr<schar , 3>, ::sum::runSqr<schar , 4>},
{0, ::sum::runSqr<ushort, 1>, ::sum::runSqr<ushort, 2>, ::sum::runSqr<ushort, 3>, ::sum::runSqr<ushort, 4>},
{0, ::sum::runSqr<short , 1>, ::sum::runSqr<short , 2>, ::sum::runSqr<short , 3>, ::sum::runSqr<short , 4>},
{0, ::sum::runSqr<int , 1>, ::sum::runSqr<int , 2>, ::sum::runSqr<int , 3>, ::sum::runSqr<int , 4>},
{0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
{0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
};
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
buf.setTo(Scalar::all(0));
const func_t func = funcs[src.depth()][src.channels()];
double result[4];
func(src, buf.data, result, mask);
return Scalar(result[0], result[1], result[2], result[3]);
}
////////////////////////////////////////////////////////////////////////
// minMax
namespace minMax
{
void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
}
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
{
GpuMat buf;
gpu::minMax(src, minVal, maxVal, mask, buf);
}
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
{
typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
static const func_t funcs[] =
{
::minMax::run<uchar>,
::minMax::run<schar>,
::minMax::run<ushort>,
::minMax::run<short>,
::minMax::run<int>,
::minMax::run<float>,
::minMax::run<double>
};
CV_Assert( src.channels() == 1 );
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
::minMax::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
const func_t func = funcs[src.depth()];
double temp1, temp2;
func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, buf);
}
////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace minMaxLoc
{
void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows);
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
}
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
{
GpuMat valBuf, locBuf;
gpu::minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
}
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
{
typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
static const func_t funcs[] =
{
::minMaxLoc::run<uchar>,
::minMaxLoc::run<schar>,
::minMaxLoc::run<ushort>,
::minMaxLoc::run<short>,
::minMaxLoc::run<int>,
::minMaxLoc::run<float>,
::minMaxLoc::run<double>
};
CV_Assert( src.channels() == 1 );
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size valbuf_size, locbuf_size;
::minMaxLoc::getBufSize(src.cols, src.rows, src.elemSize(), valbuf_size.width, valbuf_size.height, locbuf_size.width, locbuf_size.height);
ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
const func_t func = funcs[src.depth()];
double temp1, temp2;
Point temp3, temp4;
func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, minLoc ? &minLoc->x : &temp3.x, maxLoc ? &maxLoc->x : &temp4.x, valBuf, locBuf);
}
//////////////////////////////////////////////////////////////////////////////
// countNonZero
namespace countNonZero
{
void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
template <typename T>
int run(const PtrStepSzb src, PtrStep<unsigned int> buf);
}
int cv::gpu::countNonZero(const GpuMat& src)
{
GpuMat buf;
return countNonZero(src, buf);
}
int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
{
typedef int (*func_t)(const PtrStepSzb src, PtrStep<unsigned int> buf);
static const func_t funcs[] =
{
::countNonZero::run<uchar>,
::countNonZero::run<schar>,
::countNonZero::run<ushort>,
::countNonZero::run<short>,
::countNonZero::run<int>,
::countNonZero::run<float>,
::countNonZero::run<double>
};
CV_Assert(src.channels() == 1);
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
::countNonZero::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
const func_t func = funcs[src.depth()];
return func(src, buf);
}
//////////////////////////////////////////////////////////////////////////////
// reduce
namespace reduce
{
template <typename T, typename S, typename D>
void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template <typename T, typename S, typename D>
void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
}
void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
{
CV_Assert( src.channels() <= 4 );
CV_Assert( dim == 0 || dim == 1 );
CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );
if (dtype < 0)
dtype = src.depth();
dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
if (dim == 0)
{
typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
static const func_t funcs[7][7] =
{
{
::reduce::rows<unsigned char, int, unsigned char>,
0/*::reduce::rows<unsigned char, int, signed char>*/,
0/*::reduce::rows<unsigned char, int, unsigned short>*/,
0/*::reduce::rows<unsigned char, int, short>*/,
::reduce::rows<unsigned char, int, int>,
::reduce::rows<unsigned char, float, float>,
::reduce::rows<unsigned char, double, double>
},
{
0/*::reduce::rows<signed char, int, unsigned char>*/,
0/*::reduce::rows<signed char, int, signed char>*/,
0/*::reduce::rows<signed char, int, unsigned short>*/,
0/*::reduce::rows<signed char, int, short>*/,
0/*::reduce::rows<signed char, int, int>*/,
0/*::reduce::rows<signed char, float, float>*/,
0/*::reduce::rows<signed char, double, double>*/
},
{
0/*::reduce::rows<unsigned short, int, unsigned char>*/,
0/*::reduce::rows<unsigned short, int, signed char>*/,
::reduce::rows<unsigned short, int, unsigned short>,
0/*::reduce::rows<unsigned short, int, short>*/,
::reduce::rows<unsigned short, int, int>,
::reduce::rows<unsigned short, float, float>,
::reduce::rows<unsigned short, double, double>
},
{
0/*::reduce::rows<short, int, unsigned char>*/,
0/*::reduce::rows<short, int, signed char>*/,
0/*::reduce::rows<short, int, unsigned short>*/,
::reduce::rows<short, int, short>,
::reduce::rows<short, int, int>,
::reduce::rows<short, float, float>,
::reduce::rows<short, double, double>
},
{
0/*::reduce::rows<int, int, unsigned char>*/,
0/*::reduce::rows<int, int, signed char>*/,
0/*::reduce::rows<int, int, unsigned short>*/,
0/*::reduce::rows<int, int, short>*/,
::reduce::rows<int, int, int>,
::reduce::rows<int, float, float>,
::reduce::rows<int, double, double>
},
{
0/*::reduce::rows<float, float, unsigned char>*/,
0/*::reduce::rows<float, float, signed char>*/,
0/*::reduce::rows<float, float, unsigned short>*/,
0/*::reduce::rows<float, float, short>*/,
0/*::reduce::rows<float, float, int>*/,
::reduce::rows<float, float, float>,
::reduce::rows<float, double, double>
},
{
0/*::reduce::rows<double, double, unsigned char>*/,
0/*::reduce::rows<double, double, signed char>*/,
0/*::reduce::rows<double, double, unsigned short>*/,
0/*::reduce::rows<double, double, short>*/,
0/*::reduce::rows<double, double, int>*/,
0/*::reduce::rows<double, double, float>*/,
::reduce::rows<double, double, double>
}
};
const func_t func = funcs[src.depth()][dst.depth()];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
func(src.reshape(1), dst.data, reduceOp, StreamAccessor::getStream(stream));
}
else
{
typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
static const func_t funcs[7][7] =
{
{
::reduce::cols<unsigned char, int, unsigned char>,
0/*::reduce::cols<unsigned char, int, signed char>*/,
0/*::reduce::cols<unsigned char, int, unsigned short>*/,
0/*::reduce::cols<unsigned char, int, short>*/,
::reduce::cols<unsigned char, int, int>,
::reduce::cols<unsigned char, float, float>,
::reduce::cols<unsigned char, double, double>
},
{
0/*::reduce::cols<signed char, int, unsigned char>*/,
0/*::reduce::cols<signed char, int, signed char>*/,
0/*::reduce::cols<signed char, int, unsigned short>*/,
0/*::reduce::cols<signed char, int, short>*/,
0/*::reduce::cols<signed char, int, int>*/,
0/*::reduce::cols<signed char, float, float>*/,
0/*::reduce::cols<signed char, double, double>*/
},
{
0/*::reduce::cols<unsigned short, int, unsigned char>*/,
0/*::reduce::cols<unsigned short, int, signed char>*/,
::reduce::cols<unsigned short, int, unsigned short>,
0/*::reduce::cols<unsigned short, int, short>*/,
::reduce::cols<unsigned short, int, int>,
::reduce::cols<unsigned short, float, float>,
::reduce::cols<unsigned short, double, double>
},
{
0/*::reduce::cols<short, int, unsigned char>*/,
0/*::reduce::cols<short, int, signed char>*/,
0/*::reduce::cols<short, int, unsigned short>*/,
::reduce::cols<short, int, short>,
::reduce::cols<short, int, int>,
::reduce::cols<short, float, float>,
::reduce::cols<short, double, double>
},
{
0/*::reduce::cols<int, int, unsigned char>*/,
0/*::reduce::cols<int, int, signed char>*/,
0/*::reduce::cols<int, int, unsigned short>*/,
0/*::reduce::cols<int, int, short>*/,
::reduce::cols<int, int, int>,
::reduce::cols<int, float, float>,
::reduce::cols<int, double, double>
},
{
0/*::reduce::cols<float, float, unsigned char>*/,
0/*::reduce::cols<float, float, signed char>*/,
0/*::reduce::cols<float, float, unsigned short>*/,
0/*::reduce::cols<float, float, short>*/,
0/*::reduce::cols<float, float, int>*/,
::reduce::cols<float, float, float>,
::reduce::cols<float, double, double>
},
{
0/*::reduce::cols<double, double, unsigned char>*/,
0/*::reduce::cols<double, double, signed char>*/,
0/*::reduce::cols<double, double, unsigned short>*/,
0/*::reduce::cols<double, double, short>*/,
0/*::reduce::cols<double, double, int>*/,
0/*::reduce::cols<double, double, float>*/,
::reduce::cols<double, double, double>
}
};
const func_t func = funcs[src.depth()][dst.depth()];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
func(src, dst.data, src.channels(), reduceOp, StreamAccessor::getStream(stream));
}
}
////////////////////////////////////////////////////////////////////////
// meanStdDev
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
{
GpuMat buf;
meanStdDev(src, mean, stddev, buf);
}
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat& buf)
{
CV_Assert(src.type() == CV_8UC1);
if (!deviceSupports(FEATURE_SET_COMPUTE_13))
CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
DeviceBuffer dbuf(2);
int bufSize;
#if (CUDA_VERSION <= 4020)
nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
#else
nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
#endif
ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dbuf, (double*)dbuf + 1) );
cudaSafeCall( cudaDeviceSynchronize() );
double* ptrs[2] = {mean.val, stddev.val};
dbuf.download(ptrs);
}
//////////////////////////////////////////////////////////////////////////////
// rectStdDev
void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s)
{
CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_64FC1);
dst.create(src.size(), CV_32FC1);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
NppiRect nppRect;
nppRect.height = rect.height;
nppRect.width = rect.width;
nppRect.x = rect.x;
nppRect.y = rect.y;
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), sqr.ptr<Npp64f>(), static_cast<int>(sqr.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, nppRect) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// normalize
void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask)
{
GpuMat norm_buf;
GpuMat cvt_buf;
normalize(src, dst, a, b, norm_type, dtype, mask, norm_buf, cvt_buf);
}
void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
{
double scale = 1, shift = 0;
if (norm_type == NORM_MINMAX)
{
double smin = 0, smax = 0;
double dmin = std::min(a, b), dmax = std::max(a, b);
gpu::minMax(src, &smin, &smax, mask, norm_buf);
scale = (dmax - dmin) * (smax - smin > std::numeric_limits<double>::epsilon() ? 1.0 / (smax - smin) : 0.0);
shift = dmin - smin * scale;
}
else if (norm_type == NORM_L2 || norm_type == NORM_L1 || norm_type == NORM_INF)
{
scale = gpu::norm(src, norm_type, mask, norm_buf);
scale = scale > std::numeric_limits<double>::epsilon() ? a / scale : 0.0;
shift = 0;
}
else
{
CV_Error(cv::Error::StsBadArg, "Unknown/unsupported norm type");
}
if (mask.empty())
{
src.convertTo(dst, dtype, scale, shift);
}
else
{
src.convertTo(cvt_buf, dtype, scale, shift);
cvt_buf.copyTo(dst, mask);
}
}
#endif