renamed gpuarithm -> cudaarithm

This commit is contained in:
Vladislav Vinogradov
2013-07-23 15:24:10 +04:00
parent 6d216d78d9
commit dcd600cc47
101 changed files with 72 additions and 72 deletions

View File

@@ -0,0 +1,147 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
struct VAbsDiff4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff4(a, b);
}
__host__ __device__ __forceinline__ VAbsDiff4() {}
__host__ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4&) {}
};
struct VAbsDiff2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff2(a, b);
}
__host__ __device__ __forceinline__ VAbsDiff2() {}
__host__ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2&) {}
};
__device__ __forceinline__ int _abs(int a)
{
return ::abs(a);
}
__device__ __forceinline__ float _abs(float a)
{
return ::fabsf(a);
}
__device__ __forceinline__ double _abs(double a)
{
return ::fabs(a);
}
template <typename T> struct AbsDiffMat : binary_function<T, T, T>
{
__device__ __forceinline__ T operator ()(T a, T b) const
{
return saturate_cast<T>(_abs(a - b));
}
__host__ __device__ __forceinline__ AbsDiffMat() {}
__host__ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T> struct TransformFunctorTraits< arithm::AbsDiffMat<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
}
void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
}
template <typename T>
void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, AbsDiffMat<T>(), WithOutMask(), stream);
}
template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,98 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
template <typename T, typename S> struct AbsDiffScalar : unary_function<T, T>
{
S val;
__host__ explicit AbsDiffScalar(S val_) : val(val_) {}
__device__ __forceinline__ T operator ()(T a) const
{
abs_func<S> f;
return saturate_cast<T>(f(a - val));
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S> struct TransformFunctorTraits< arithm::AbsDiffScalar<T, S> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T, typename S>
void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
{
AbsDiffScalar<T, S> op(static_cast<S>(val));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, op, WithOutMask(), stream);
}
template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,185 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
struct VAdd4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vadd4(a, b);
}
__host__ __device__ __forceinline__ VAdd4() {}
__host__ __device__ __forceinline__ VAdd4(const VAdd4&) {}
};
struct VAdd2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vadd2(a, b);
}
__host__ __device__ __forceinline__ VAdd2() {}
__host__ __device__ __forceinline__ VAdd2(const VAdd2&) {}
};
template <typename T, typename D> struct AddMat : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(a + b);
}
__host__ __device__ __forceinline__ AddMat() {}
__host__ __device__ __forceinline__ AddMat(const AddMat&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T, typename D> struct TransformFunctorTraits< arithm::AddMat<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
}
void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
}
template <typename T, typename D>
void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), WithOutMask(), stream);
}
template void addMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,148 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
template <typename T, typename S, typename D> struct AddScalar : unary_function<T, D>
{
S val;
__host__ explicit AddScalar(S val_) : val(val_) {}
__device__ __forceinline__ D operator ()(T a) const
{
return saturate_cast<D>(a + val);
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::AddScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
AddScalar<T, S, D> op(static_cast<S>(val));
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void addScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,364 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
template <typename T> struct UseDouble_
{
enum {value = 0};
};
template <> struct UseDouble_<double>
{
enum {value = 1};
};
template <typename T1, typename T2, typename D> struct UseDouble
{
enum {value = (UseDouble_<T1>::value || UseDouble_<T2>::value || UseDouble_<D>::value)};
};
template <typename T1, typename T2, typename D, bool useDouble> struct AddWeighted_;
template <typename T1, typename T2, typename D> struct AddWeighted_<T1, T2, D, false> : binary_function<T1, T2, D>
{
float alpha;
float beta;
float gamma;
__host__ AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(static_cast<float>(alpha_)), beta(static_cast<float>(beta_)), gamma(static_cast<float>(gamma_)) {}
__device__ __forceinline__ D operator ()(T1 a, T2 b) const
{
return saturate_cast<D>(a * alpha + b * beta + gamma);
}
};
template <typename T1, typename T2, typename D> struct AddWeighted_<T1, T2, D, true> : binary_function<T1, T2, D>
{
double alpha;
double beta;
double gamma;
__host__ AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}
__device__ __forceinline__ D operator ()(T1 a, T2 b) const
{
return saturate_cast<D>(a * alpha + b * beta + gamma);
}
};
template <typename T1, typename T2, typename D> struct AddWeighted : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value>
{
AddWeighted(double alpha_, double beta_, double gamma_) : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value>(alpha_, beta_, gamma_) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T1, typename T2, typename D, size_t src1_size, size_t src2_size, size_t dst_size> struct AddWeightedTraits : DefaultTransformFunctorTraits< arithm::AddWeighted<T1, T2, D> >
{
};
template <typename T1, typename T2, typename D, size_t src_size, size_t dst_size> struct AddWeightedTraits<T1, T2, D, src_size, src_size, dst_size> : arithm::ArithmFuncTraits<src_size, dst_size>
{
};
template <typename T1, typename T2, typename D> struct TransformFunctorTraits< arithm::AddWeighted<T1, T2, D> > : AddWeightedTraits<T1, T2, D, sizeof(T1), sizeof(T2), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T1, typename T2, typename D>
void addWeighted(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream)
{
AddWeighted<T1, T2, D> op(alpha, beta, gamma);
device::transform((PtrStepSz<T1>) src1, (PtrStepSz<T2>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void addWeighted<uchar, uchar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,145 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __ARITHM_FUNC_TRAITS_HPP__
#define __ARITHM_FUNC_TRAITS_HPP__
#include <cstddef>
namespace arithm
{
template <size_t src_size, size_t dst_size> struct ArithmFuncTraits
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 1 };
};
template <> struct ArithmFuncTraits<1, 1>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<1, 2>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<1, 4>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<2, 1>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<2, 2>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<2, 4>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<4, 1>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<4, 2>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<4, 4>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
}
#endif // __ARITHM_FUNC_TRAITS_HPP__

View File

@@ -0,0 +1,126 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< bit_not<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< bit_and<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< bit_or<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< bit_xor<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), mask, stream);
else
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), WithOutMask(), stream);
}
template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), WithOutMask(), stream);
}
template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), WithOutMask(), stream);
}
template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), WithOutMask(), stream);
}
template void bitMatNot<uchar>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatNot<ushort>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatNot<uint>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatAnd<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatAnd<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatAnd<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatOr<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatOr<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatOr<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatXor<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatXor<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatXor<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,104 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< binder2nd< bit_and<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< binder2nd< bit_or<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< binder2nd< bit_xor<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T> void bitScalarAnd(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_and<T>(), src2), WithOutMask(), stream);
}
template <typename T> void bitScalarOr(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_or<T>(), src2), WithOutMask(), stream);
}
template <typename T> void bitScalarXor(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_xor<T>(), src2), WithOutMask(), stream);
}
template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,206 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
struct VCmpEq4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpeq4(a, b);
}
__host__ __device__ __forceinline__ VCmpEq4() {}
__host__ __device__ __forceinline__ VCmpEq4(const VCmpEq4&) {}
};
struct VCmpNe4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpne4(a, b);
}
__host__ __device__ __forceinline__ VCmpNe4() {}
__host__ __device__ __forceinline__ VCmpNe4(const VCmpNe4&) {}
};
struct VCmpLt4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmplt4(a, b);
}
__host__ __device__ __forceinline__ VCmpLt4() {}
__host__ __device__ __forceinline__ VCmpLt4(const VCmpLt4&) {}
};
struct VCmpLe4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmple4(a, b);
}
__host__ __device__ __forceinline__ VCmpLe4() {}
__host__ __device__ __forceinline__ VCmpLe4(const VCmpLe4&) {}
};
template <class Op, typename T>
struct Cmp : binary_function<T, T, uchar>
{
__device__ __forceinline__ uchar operator()(T a, T b) const
{
Op op;
return -op(a, b);
}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VCmpEq4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpNe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpLt4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpLe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <class Op, typename T> struct TransformFunctorTraits< arithm::Cmp<Op, T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
{
};
}}}
namespace arithm
{
void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
}
void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
}
void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
}
void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
}
template <template <typename> class Op, typename T>
void cmpMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
Cmp<Op<T>, T> op;
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, dst, op, WithOutMask(), stream);
}
template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
cmpMat<equal_to, T>(src1, src2, dst, stream);
}
template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
cmpMat<not_equal_to, T>(src1, src2, dst, stream);
}
template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
cmpMat<less, T>(src1, src2, dst, stream);
}
template <typename T> void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
cmpMat<less_equal, T>(src1, src2, dst, stream);
}
template void cmpMatEq<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,284 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
template <class Op, typename T>
struct Cmp : binary_function<T, T, uchar>
{
__device__ __forceinline__ uchar operator()(T a, T b) const
{
Op op;
return -op(a, b);
}
};
#define TYPE_VEC(type, cn) typename TypeVec<type, cn>::vec_type
template <class Op, typename T, int cn> struct CmpScalar;
template <class Op, typename T>
struct CmpScalar<Op, T, 1> : unary_function<T, uchar>
{
T val;
__host__ explicit CmpScalar(T val_) : val(val_) {}
__device__ __forceinline__ uchar operator()(T src) const
{
Cmp<Op, T> op;
return op(src, val);
}
};
template <class Op, typename T>
struct CmpScalar<Op, T, 2> : unary_function<TYPE_VEC(T, 2), TYPE_VEC(uchar, 2)>
{
TYPE_VEC(T, 2) val;
__host__ explicit CmpScalar(TYPE_VEC(T, 2) val_) : val(val_) {}
__device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const
{
Cmp<Op, T> op;
return VecTraits<TYPE_VEC(uchar, 2)>::make(op(src.x, val.x), op(src.y, val.y));
}
};
template <class Op, typename T>
struct CmpScalar<Op, T, 3> : unary_function<TYPE_VEC(T, 3), TYPE_VEC(uchar, 3)>
{
TYPE_VEC(T, 3) val;
__host__ explicit CmpScalar(TYPE_VEC(T, 3) val_) : val(val_) {}
__device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const
{
Cmp<Op, T> op;
return VecTraits<TYPE_VEC(uchar, 3)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z));
}
};
template <class Op, typename T>
struct CmpScalar<Op, T, 4> : unary_function<TYPE_VEC(T, 4), TYPE_VEC(uchar, 4)>
{
TYPE_VEC(T, 4) val;
__host__ explicit CmpScalar(TYPE_VEC(T, 4) val_) : val(val_) {}
__device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const
{
Cmp<Op, T> op;
return VecTraits<TYPE_VEC(uchar, 4)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z), op(src.w, val.w));
}
};
#undef TYPE_VEC
}
namespace cv { namespace cuda { namespace device
{
template <class Op, typename T> struct TransformFunctorTraits< arithm::CmpScalar<Op, T, 1> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
{
};
}}}
namespace arithm
{
template <template <typename> class Op, typename T, int cn>
void cmpScalar(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type src_t;
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
T sval[] = {static_cast<T>(val[0]), static_cast<T>(val[1]), static_cast<T>(val[2]), static_cast<T>(val[3])};
src_t val1 = VecTraits<src_t>::make(sval);
CmpScalar<Op<T>, T, cn> op(val1);
device::transform((PtrStepSz<src_t>) src, (PtrStepSz<dst_t>) dst, op, WithOutMask(), stream);
}
template <typename T> void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<equal_to, T, 1>,
cmpScalar<equal_to, T, 2>,
cmpScalar<equal_to, T, 3>,
cmpScalar<equal_to, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<not_equal_to, T, 1>,
cmpScalar<not_equal_to, T, 2>,
cmpScalar<not_equal_to, T, 3>,
cmpScalar<not_equal_to, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<less, T, 1>,
cmpScalar<less, T, 2>,
cmpScalar<less, T, 3>,
cmpScalar<less, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<less_equal, T, 1>,
cmpScalar<less_equal, T, 2>,
cmpScalar<less_equal, T, 3>,
cmpScalar<less_equal, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<greater, T, 1>,
cmpScalar<greater, T, 2>,
cmpScalar<greater, T, 3>,
cmpScalar<greater, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<greater_equal, T, 1>,
cmpScalar<greater_equal, T, 2>,
cmpScalar<greater_equal, T, 3>,
cmpScalar<greater_equal, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template void cmpScalarEq<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,131 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, PtrStepSz<T> dst, int top, int left)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = src(y - top, x - left);
}
template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
{
static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, int top, int left,
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);
copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode,
const T* borderValue, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type vec_type;
typedef void (*caller_t)(const PtrStepSz<vec_type>& src, const PtrStepSz<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
static const caller_t callers[5] =
{
CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
CopyMakeBorderDispatcher<BrdWrap, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call
};
callers[borderMode](PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
}
template void copyMakeBorder_gpu<uchar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<ushort, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<short, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<float, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace cuda { namespace cudev
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,176 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace countNonZero
{
__device__ unsigned int blocks_finished = 0;
template <int BLOCK_SIZE, typename T>
__global__ void kernel(const PtrStepSz<T> src, unsigned int* count, const int twidth, const int theight)
{
__shared__ unsigned int scount[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
unsigned int mycount = 0;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
const T srcVal = ptr[x];
mycount += (srcVal != 0);
}
}
device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
#if __CUDA_ARCH__ >= 200
if (tid == 0)
::atomicAdd(count, mycount);
#else
__shared__ bool is_last;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
if (tid == 0)
{
count[bid] = mycount;
__threadfence();
unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
mycount = tid < gridDim.x * gridDim.y ? count[tid] : 0;
device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
if (tid == 0)
{
count[0] = mycount;
blocks_finished = 0;
}
}
#endif
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
}
template <typename T>
int run(const PtrStepSzb src, PtrStep<unsigned int> buf)
{
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
unsigned int* count_buf = buf.ptr(0);
cudaSafeCall( cudaMemset(count_buf, 0, sizeof(unsigned int)) );
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, count_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
unsigned int count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost));
return count;
}
template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<int >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,230 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
struct Div_8uc4_32f : binary_function<uint, float, uint>
{
__device__ __forceinline__ uint operator ()(uint a, float b) const
{
uint res = 0;
if (b != 0)
{
b = 1.0f / b;
res |= (saturate_cast<uchar>((0xffu & (a )) * b) );
res |= (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8);
res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);
}
return res;
}
};
struct Div_16sc4_32f : binary_function<short4, float, short4>
{
__device__ __forceinline__ short4 operator ()(short4 a, float b) const
{
return b != 0 ? make_short4(saturate_cast<short>(a.x / b), saturate_cast<short>(a.y / b),
saturate_cast<short>(a.z / b), saturate_cast<short>(a.w / b))
: make_short4(0,0,0,0);
}
};
template <typename T, typename D> struct Div : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return b != 0 ? saturate_cast<D>(a / b) : 0;
}
__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T> struct Div<T, float> : binary_function<T, T, float>
{
__device__ __forceinline__ float operator ()(T a, T b) const
{
return b != 0 ? static_cast<float>(a) / b : 0;
}
__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T> struct Div<T, double> : binary_function<T, T, double>
{
__device__ __forceinline__ double operator ()(T a, T b) const
{
return b != 0 ? static_cast<double>(a) / b : 0;
}
__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D>
{
S scale;
__host__ explicit DivScale(S scale_) : scale(scale_) {}
__device__ __forceinline__ D operator ()(T a, T b) const
{
return b != 0 ? saturate_cast<D>(scale * a / b) : 0;
}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits<arithm::Div_8uc4_32f> : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T, typename D> struct TransformFunctorTraits< arithm::Div<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScale<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
void divMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream);
}
void divMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream);
}
template <typename T, typename S, typename D>
void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream)
{
if (scale == 1)
{
Div<T, D> op;
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
else
{
DivScale<T, S, D> op(static_cast<S>(scale));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
}
template void divMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,168 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
template <typename T, typename S, typename D> struct DivScalar : unary_function<T, D>
{
S val;
__host__ explicit DivScalar(S val_) : val(val_) {}
__device__ __forceinline__ D operator ()(T a) const
{
return saturate_cast<D>(a / val);
}
};
template <typename T, typename S, typename D> struct DivScalarInv : unary_function<T, D>
{
S val;
explicit DivScalarInv(S val_) : val(val_) {}
__device__ __forceinline__ D operator ()(T a) const
{
return a != 0 ? saturate_cast<D>(val / a) : 0;
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScalarInv<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream)
{
if (inv)
{
DivScalarInv<T, S, D> op(static_cast<S>(val));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
else
{
DivScalar<T, S, D> op(static_cast<S>(val));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
}
template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<ushort, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<ushort, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<short, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<short, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<int, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<int, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<int, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<float, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<float, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<double, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,472 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
// Utility function to extract unsigned chars from an unsigned integer
__device__ uchar4 int_to_uchar4(unsigned int in)
{
uchar4 bytes;
bytes.x = (in & 0x000000ff) >> 0;
bytes.y = (in & 0x0000ff00) >> 8;
bytes.z = (in & 0x00ff0000) >> 16;
bytes.w = (in & 0xff000000) >> 24;
return bytes;
}
__global__ void shfl_integral_horizontal(const PtrStep<uint4> img, PtrStep<uint4> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ int sums[128];
const int id = threadIdx.x;
const int lane_id = id % warpSize;
const int warp_id = id / warpSize;
const uint4 data = img(blockIdx.x, id);
const uchar4 a = int_to_uchar4(data.x);
const uchar4 b = int_to_uchar4(data.y);
const uchar4 c = int_to_uchar4(data.z);
const uchar4 d = int_to_uchar4(data.w);
int result[16];
result[0] = a.x;
result[1] = result[0] + a.y;
result[2] = result[1] + a.z;
result[3] = result[2] + a.w;
result[4] = result[3] + b.x;
result[5] = result[4] + b.y;
result[6] = result[5] + b.z;
result[7] = result[6] + b.w;
result[8] = result[7] + c.x;
result[9] = result[8] + c.y;
result[10] = result[9] + c.z;
result[11] = result[10] + c.w;
result[12] = result[11] + d.x;
result[13] = result[12] + d.y;
result[14] = result[13] + d.z;
result[15] = result[14] + d.w;
int sum = result[15];
// the prefix sum for each thread's 16 value is computed,
// now the final sums (result[15]) need to be shared
// with the other threads and add. To do this,
// the __shfl_up() instruction is used and a shuffle scan
// operation is performed to distribute the sums to the correct
// threads
#pragma unroll
for (int i = 1; i < 32; i *= 2)
{
const int n = __shfl_up(sum, i, 32);
if (lane_id >= i)
{
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += n;
sum += n;
}
}
// Now the final sum for the warp must be shared
// between warps. This is done by each warp
// having a thread store to shared memory, then
// having some other warp load the values and
// compute a prefix sum, again by using __shfl_up.
// The results are uniformly added back to the warps.
// last thread in the warp holding sum of the warp
// places that in shared
if (threadIdx.x % warpSize == warpSize - 1)
sums[warp_id] = result[15];
__syncthreads();
if (warp_id == 0)
{
int warp_sum = sums[lane_id];
#pragma unroll
for (int i = 1; i <= 32; i *= 2)
{
const int n = __shfl_up(warp_sum, i, 32);
if (lane_id >= i)
warp_sum += n;
}
sums[lane_id] = warp_sum;
}
__syncthreads();
int blockSum = 0;
// fold in unused warp
if (warp_id > 0)
{
blockSum = sums[warp_id - 1];
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += blockSum;
}
// assemble result
// Each thread has 16 values to write, which are
// now integer data (to avoid overflow). Instead of
// each thread writing consecutive uint4s, the
// approach shown here experiments using
// the shuffle command to reformat the data
// inside the registers so that each thread holds
// consecutive data to be written so larger contiguous
// segments can be assembled for writing.
/*
For example data that needs to be written as
GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
but is stored in registers (r0..r3), in four threads (0..3) as:
threadId 0 1 2 3
r0 x0 y0 z0 w0
r1 x1 y1 z1 w1
r2 x2 y2 z2 w2
r3 x3 y3 z3 w3
after apply __shfl_xor operations to move data between registers r1..r3:
threadId 00 01 10 11
x0 y0 z0 w0
xor(01)->y1 x1 w1 z1
xor(10)->z2 w2 x2 y2
xor(11)->w3 z3 y3 x3
and now x0..x3, and z0..z3 can be written out in order by all threads.
In the current code, each register above is actually representing
four integers to be written as uint4's to GMEM.
*/
result[4] = __shfl_xor(result[4] , 1, 32);
result[5] = __shfl_xor(result[5] , 1, 32);
result[6] = __shfl_xor(result[6] , 1, 32);
result[7] = __shfl_xor(result[7] , 1, 32);
result[8] = __shfl_xor(result[8] , 2, 32);
result[9] = __shfl_xor(result[9] , 2, 32);
result[10] = __shfl_xor(result[10], 2, 32);
result[11] = __shfl_xor(result[11], 2, 32);
result[12] = __shfl_xor(result[12], 3, 32);
result[13] = __shfl_xor(result[13], 3, 32);
result[14] = __shfl_xor(result[14], 3, 32);
result[15] = __shfl_xor(result[15], 3, 32);
uint4* integral_row = integral.ptr(blockIdx.x);
uint4 output;
///////
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
///////
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
// continuning from the above example,
// this use of __shfl_xor() places the y0..y3 and w0..w3 data
// in order.
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] = __shfl_xor(result[i], 1, 32);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
///////
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
#endif
}
// This kernel computes columnwise prefix sums. When the data input is
// the row sums from above, this completes the integral image.
// The approach here is to have each block compute a local set of sums.
// First , the data covered by the block is loaded into shared memory,
// then instead of performing a sum in shared memory using __syncthreads
// between stages, the data is reformatted so that the necessary sums
// occur inside warps and the shuffle scan operation is used.
// The final set of sums from the block is then propgated, with the block
// computing "down" the image and adding the running sum to the local
// block sums.
__global__ void shfl_integral_vertical(PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;
if (tidx >= integral.cols)
return;
sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();
unsigned int stepSum = 0;
for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
{
unsigned int* p = integral.ptr(y) + tidx;
unsigned int sum = *p;
sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();
// place into SMEM
// shfl scan reduce the SMEM, reformating so the column
// sums are computed in a warp
// then read out properly
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;
int partial_sum = sums[k][j];
for (int i = 1; i <= 8; i *= 2)
{
int n = __shfl_up(partial_sum, i, 32);
if (lane_id >= i)
partial_sum += n;
}
sums[k][j] = partial_sum;
__syncthreads();
if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];
sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];
__syncthreads();
*p = sum;
}
#endif
}
void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
{
{
// each thread handles 16 values, use 1 block/row
// save, becouse step is actually can't be less 512 bytes
int block = integral.cols / 16;
// launch 1 block / row
const int grid = img.rows;
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
cudaSafeCall( cudaGetLastError() );
}
{
const dim3 block(32, 8);
const dim3 grid(divUp(integral.cols, block.x), 1);
shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void shfl_integral_vertical(PtrStepSz<unsigned int> buffer, PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;
if (tidx >= integral.cols)
return;
sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();
unsigned int stepSum = 0;
for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
{
unsigned int* p = buffer.ptr(y) + tidx;
unsigned int* dst = integral.ptr(y + 1) + tidx + 1;
unsigned int sum = *p;
sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();
// place into SMEM
// shfl scan reduce the SMEM, reformating so the column
// sums are computed in a warp
// then read out properly
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;
int partial_sum = sums[k][j];
for (int i = 1; i <= 8; i *= 2)
{
int n = __shfl_up(partial_sum, i, 32);
if (lane_id >= i)
partial_sum += n;
}
sums[k][j] = partial_sum;
__syncthreads();
if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];
sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];
__syncthreads();
*dst = sum;
}
#endif
}
// used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz<uint4> buffer, PtrStepSz<unsigned int> integral,
int blockStep, cudaStream_t stream)
{
{
const int block = blockStep;
const int grid = img.rows;
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, buffer);
cudaSafeCall( cudaGetLastError() );
}
{
const dim3 block(32, 8);
const dim3 grid(divUp(integral.cols, block.x), 1);
shfl_integral_vertical<<<grid, block, 0, stream>>>((PtrStepSz<uint>)buffer, integral);
cudaSafeCall( cudaGetLastError() );
}
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,302 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/type_traits.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
//////////////////////////////////////////////////////////////////////////
// absMat
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< abs_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, abs_func<T>(), WithOutMask(), stream);
}
template void absMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// sqrMat
namespace arithm
{
template <typename T> struct Sqr : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(T x) const
{
return saturate_cast<T>(x * x);
}
__host__ __device__ __forceinline__ Sqr() {}
__host__ __device__ __forceinline__ Sqr(const Sqr&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< arithm::Sqr<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Sqr<T>(), WithOutMask(), stream);
}
template void sqrMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// sqrtMat
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< sqrt_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, sqrt_func<T>(), WithOutMask(), stream);
}
template void sqrtMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// logMat
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< log_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, log_func<T>(), WithOutMask(), stream);
}
template void logMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// expMat
namespace arithm
{
template <typename T> struct Exp : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(T x) const
{
exp_func<T> f;
return saturate_cast<T>(f(x));
}
__host__ __device__ __forceinline__ Exp() {}
__host__ __device__ __forceinline__ Exp(const Exp&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< arithm::Exp<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Exp<T>(), WithOutMask(), stream);
}
template void expMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// pow
namespace arithm
{
template<typename T, bool Signed = numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>
{
float power;
__host__ explicit PowOp(double power_) : power(static_cast<float>(power_)) {}
__device__ __forceinline__ T operator()(T e) const
{
return saturate_cast<T>(__powf((float)e, power));
}
};
template<typename T> struct PowOp<T, true> : unary_function<T, T>
{
float power;
__host__ explicit PowOp(double power_) : power(static_cast<float>(power_)) {}
__device__ __forceinline__ T operator()(T e) const
{
T res = saturate_cast<T>(__powf((float)e, power));
if ((e < 0) && (1 & static_cast<int>(power)))
res *= -1;
return res;
}
};
template<> struct PowOp<float> : unary_function<float, float>
{
float power;
__host__ explicit PowOp(double power_) : power(static_cast<float>(power_)) {}
__device__ __forceinline__ float operator()(float e) const
{
return __powf(::fabs(e), power);
}
};
template<> struct PowOp<double> : unary_function<double, double>
{
double power;
__host__ explicit PowOp(double power_) : power(power_) {}
__device__ __forceinline__ double operator()(double e) const
{
return ::pow(::fabs(e), power);
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< arithm::PowOp<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template<typename T>
void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, PowOp<T>(power), WithOutMask(), stream);
}
template void pow<uchar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<schar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<short>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<ushort>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<int>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<float>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<double>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,247 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/utility.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace minMax
{
__device__ unsigned int blocks_finished = 0;
// To avoid shared bank conflicts we convert each value into value of
// appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits;
template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<schar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
template <int BLOCK_SIZE, typename R>
struct GlobalReduce
{
static __device__ void run(R& mymin, R& mymax, R* minval, R* maxval, int tid, int bid, R* sminval, R* smaxval)
{
#if __CUDA_ARCH__ >= 200
if (tid == 0)
{
Emulation::glob::atomicMin(minval, mymin);
Emulation::glob::atomicMax(maxval, mymax);
}
#else
__shared__ bool is_last;
if (tid == 0)
{
minval[bid] = mymin;
maxval[bid] = mymax;
__threadfence();
unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
int idx = ::min(tid, gridDim.x * gridDim.y - 1);
mymin = minval[idx];
mymax = maxval[idx];
const minimum<R> minOp;
const maximum<R> maxOp;
device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
if (tid == 0)
{
minval[0] = mymin;
maxval[0] = mymax;
blocks_finished = 0;
}
}
#endif
}
};
template <int BLOCK_SIZE, typename T, typename R, class Mask>
__global__ void kernel(const PtrStepSz<T> src, const Mask mask, R* minval, R* maxval, const int twidth, const int theight)
{
__shared__ R sminval[BLOCK_SIZE];
__shared__ R smaxval[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
R mymin = numeric_limits<R>::max();
R mymax = -numeric_limits<R>::max();
const minimum<R> minOp;
const maximum<R> maxOp;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
if (mask(y, x))
{
const R srcVal = ptr[x];
mymin = minOp(mymin, srcVal);
mymax = maxOp(mymax, srcVal);
}
}
}
device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
GlobalReduce<BLOCK_SIZE, R>::run(mymin, mymax, minval, maxval, tid, bid, sminval, smaxval);
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
bufcols = grid.x * grid.y * sizeof(double);
bufrows = 2;
}
__global__ void setDefaultKernel(int* minval_buf, int* maxval_buf)
{
*minval_buf = numeric_limits<int>::max();
*maxval_buf = numeric_limits<int>::min();
}
__global__ void setDefaultKernel(float* minval_buf, float* maxval_buf)
{
*minval_buf = numeric_limits<float>::max();
*maxval_buf = -numeric_limits<float>::max();
}
__global__ void setDefaultKernel(double* minval_buf, double* maxval_buf)
{
*minval_buf = numeric_limits<double>::max();
*maxval_buf = -numeric_limits<double>::max();
}
template <typename R>
void setDefault(R* minval_buf, R* maxval_buf)
{
setDefaultKernel<<<1, 1>>>(minval_buf, maxval_buf);
}
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
{
typedef typename MinMaxTypeTraits<T>::best_type R;
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
R* minval_buf = (R*) buf.ptr(0);
R* maxval_buf = (R*) buf.ptr(1);
setDefault(minval_buf, maxval_buf);
if (mask.data)
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, twidth, theight);
else
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
R minval_, maxval_;
cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
template void run<uchar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<schar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<ushort>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,228 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
//////////////////////////////////////////////////////////////////////////
// min
namespace arithm
{
struct VMin4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmin4(a, b);
}
__host__ __device__ __forceinline__ VMin4() {}
__host__ __device__ __forceinline__ VMin4(const VMin4&) {}
};
struct VMin2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmin2(a, b);
}
__host__ __device__ __forceinline__ VMin2() {}
__host__ __device__ __forceinline__ VMin2(const VMin2&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VMin4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VMin2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T> struct TransformFunctorTraits< minimum<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
void minMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VMin4(), WithOutMask(), stream);
}
void minMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VMin2(), WithOutMask(), stream);
}
template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, minimum<T>(), WithOutMask(), stream);
}
template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(minimum<T>(), src2), WithOutMask(), stream);
}
template void minScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// max
namespace arithm
{
struct VMax4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmax4(a, b);
}
__host__ __device__ __forceinline__ VMax4() {}
__host__ __device__ __forceinline__ VMax4(const VMax4&) {}
};
struct VMax2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmax2(a, b);
}
__host__ __device__ __forceinline__ VMax2() {}
__host__ __device__ __forceinline__ VMax2(const VMax2&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VMax4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VMax2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T> struct TransformFunctorTraits< maximum<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
void maxMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VMax4(), WithOutMask(), stream);
}
void maxMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VMax2(), WithOutMask(), stream);
}
template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, maximum<T>(), WithOutMask(), stream);
}
template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(maximum<T>(), src2), WithOutMask(), stream);
}
template void maxScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,236 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/utility.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace minMaxLoc
{
// To avoid shared bank conflicts we convert each value into value of
// appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits;
template <> struct MinMaxTypeTraits<unsigned char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<signed char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<unsigned short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
template <int BLOCK_SIZE, typename T, class Mask>
__global__ void kernel_pass_1(const PtrStepSz<T> src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight)
{
typedef typename MinMaxTypeTraits<T>::best_type work_type;
__shared__ work_type sminval[BLOCK_SIZE];
__shared__ work_type smaxval[BLOCK_SIZE];
__shared__ unsigned int sminloc[BLOCK_SIZE];
__shared__ unsigned int smaxloc[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
work_type mymin = numeric_limits<work_type>::max();
work_type mymax = -numeric_limits<work_type>::max();
unsigned int myminloc = 0;
unsigned int mymaxloc = 0;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
if (mask(y, x))
{
const work_type srcVal = ptr[x];
if (srcVal < mymin)
{
mymin = srcVal;
myminloc = y * src.cols + x;
}
if (srcVal > mymax)
{
mymax = srcVal;
mymaxloc = y * src.cols + x;
}
}
}
}
reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
tid,
thrust::make_tuple(less<work_type>(), greater<work_type>()));
if (tid == 0)
{
minval[bid] = (T) mymin;
maxval[bid] = (T) mymax;
minloc[bid] = myminloc;
maxloc[bid] = mymaxloc;
}
}
template <int BLOCK_SIZE, typename T>
__global__ void kernel_pass_2(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int count)
{
typedef typename MinMaxTypeTraits<T>::best_type work_type;
__shared__ work_type sminval[BLOCK_SIZE];
__shared__ work_type smaxval[BLOCK_SIZE];
__shared__ unsigned int sminloc[BLOCK_SIZE];
__shared__ unsigned int smaxloc[BLOCK_SIZE];
unsigned int idx = ::min(threadIdx.x, count - 1);
work_type mymin = minval[idx];
work_type mymax = maxval[idx];
unsigned int myminloc = minloc[idx];
unsigned int mymaxloc = maxloc[idx];
reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
threadIdx.x,
thrust::make_tuple(less<work_type>(), greater<work_type>()));
if (threadIdx.x == 0)
{
minval[0] = (T) mymin;
maxval[0] = (T) mymax;
minloc[0] = myminloc;
maxloc[0] = mymaxloc;
}
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
// For values
b1cols = (int)(grid.x * grid.y * elem_size);
b1rows = 2;
// For locations
b2cols = grid.x * grid.y * sizeof(int);
b2rows = 2;
}
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf)
{
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
T* minval_buf = (T*) valbuf.ptr(0);
T* maxval_buf = (T*) valbuf.ptr(1);
unsigned int* minloc_buf = locbuf.ptr(0);
unsigned int* maxloc_buf = locbuf.ptr(1);
if (mask.data)
kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
else
kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );
kernel_pass_2<threads_x * threads_y><<<1, threads_x * threads_y>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
unsigned int minloc_, maxloc_;
cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void run<unsigned char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<signed char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<unsigned short>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,211 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
struct Mul_8uc4_32f : binary_function<uint, float, uint>
{
__device__ __forceinline__ uint operator ()(uint a, float b) const
{
uint res = 0;
res |= (saturate_cast<uchar>((0xffu & (a )) * b) );
res |= (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8);
res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);
return res;
}
__host__ __device__ __forceinline__ Mul_8uc4_32f() {}
__host__ __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f&) {}
};
struct Mul_16sc4_32f : binary_function<short4, float, short4>
{
__device__ __forceinline__ short4 operator ()(short4 a, float b) const
{
return make_short4(saturate_cast<short>(a.x * b), saturate_cast<short>(a.y * b),
saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));
}
__host__ __device__ __forceinline__ Mul_16sc4_32f() {}
__host__ __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f&) {}
};
template <typename T, typename D> struct Mul : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(a * b);
}
__host__ __device__ __forceinline__ Mul() {}
__host__ __device__ __forceinline__ Mul(const Mul&) {}
};
template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D>
{
S scale;
__host__ explicit MulScale(S scale_) : scale(scale_) {}
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(scale * a * b);
}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits<arithm::Mul_8uc4_32f> : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T, typename D> struct TransformFunctorTraits< arithm::Mul<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::MulScale<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
void mulMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream);
}
void mulMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream);
}
template <typename T, typename S, typename D>
void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream)
{
if (scale == 1)
{
Mul<T, D> op;
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
else
{
MulScale<T, S, D> op(static_cast<S>(scale));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
}
template void mulMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,144 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
template <typename T, typename S, typename D> struct MulScalar : unary_function<T, D>
{
S val;
__host__ explicit MulScalar(S val_) : val(val_) {}
__device__ __forceinline__ D operator ()(T a) const
{
return saturate_cast<D>(a * val);
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::MulScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
{
MulScalar<T, S, D> op(static_cast<S>(val));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void mulScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,171 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "cvconfig.h"
#ifdef HAVE_CUFFT
#include <cufft.h>
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace cuda { namespace device
{
//////////////////////////////////////////////////////////////////////////
// mulSpectrums
__global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < c.cols && y < c.rows)
{
c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
}
}
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulSpectrums_CONJ
__global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < c.cols && y < c.rows)
{
c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
}
}
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums
__global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < c.cols && y < c.rows)
{
cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
}
}
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
if (stream)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums_CONJ
__global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < c.cols && y < c.rows)
{
cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
}
}
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}}} // namespace cv { namespace cuda { namespace cudev
#endif // HAVE_CUFFT
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,217 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace cuda { namespace device
{
namespace mathfunc
{
//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar
struct Nothing
{
static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
{
}
};
struct Magnitude
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
}
};
struct MagnitudeSqr
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
}
};
struct Atan2
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
{
float angle = ::atan2f(y_data, x_data);
angle += (angle < 0) * 2.0f * CV_PI_F;
dst[y * dst_step + x] = scale * angle;
}
};
template <typename Mag, typename Angle>
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float x_data = xptr[y * x_step + x];
float y_data = yptr[y * y_step + x];
Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
}
}
struct NonEmptyMag
{
static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
{
return mag[y * mag_step + x];
}
};
struct EmptyMag
{
static __device__ __forceinline__ float get(const float*, size_t, int, int)
{
return 1.0f;
}
};
template <typename Mag>
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float mag_data = Mag::get(mag, mag_step, x, y);
float angle_data = angle[y * angle_step + x];
float sin_a, cos_a;
::sincosf(scale * angle_data, &sin_a, &cos_a);
xptr[y * x_step + x] = mag_data * cos_a;
yptr[y * y_step + x] = mag_data * sin_a;
}
}
template <typename Mag, typename Angle>
void cartToPolar_caller(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(x.cols, threads.x);
grid.y = divUp(x.rows, threads.y);
const float scale = angleInDegrees ? (180.0f / CV_PI_F) : 1.f;
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2][2][2] =
{
{
{
cartToPolar_caller<Magnitude, Atan2>,
cartToPolar_caller<Magnitude, Nothing>
},
{
cartToPolar_caller<MagnitudeSqr, Atan2>,
cartToPolar_caller<MagnitudeSqr, Nothing>,
}
},
{
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>
},
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>,
}
}
};
callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
}
template <typename Mag>
void polarToCart_caller(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(mag.cols, threads.x);
grid.y = divUp(mag.rows, threads.y);
const float scale = angleInDegrees ? (CV_PI_F / 180.0f) : 1.0f;
polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2] =
{
polarToCart_caller<NonEmptyMag>,
polarToCart_caller<EmptyMag>
};
callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
}
} // namespace mathfunc
}}} // namespace cv { namespace cuda { namespace cudev
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,340 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "unroll_detail.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace reduce
{
struct Sum
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(0);
}
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
return a + b;
}
template <typename T>
__device__ __forceinline__ T result(T r, int) const
{
return r;
}
__host__ __device__ __forceinline__ Sum() {}
__host__ __device__ __forceinline__ Sum(const Sum&) {}
};
template <typename T> struct OutputType
{
typedef float type;
};
template <> struct OutputType<double>
{
typedef double type;
};
struct Avg
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(0);
}
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
return a + b;
}
template <typename T>
__device__ __forceinline__ typename TypeVec<typename OutputType<typename VecTraits<T>::elem_type>::type, VecTraits<T>::cn>::vec_type result(T r, float sz) const
{
return r / sz;
}
__host__ __device__ __forceinline__ Avg() {}
__host__ __device__ __forceinline__ Avg(const Avg&) {}
};
struct Min
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
}
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
minimum<T> minOp;
return minOp(a, b);
}
template <typename T>
__device__ __forceinline__ T result(T r, int) const
{
return r;
}
__host__ __device__ __forceinline__ Min() {}
__host__ __device__ __forceinline__ Min(const Min&) {}
};
struct Max
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
}
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
maximum<T> maxOp;
return maxOp(a, b);
}
template <typename T>
__device__ __forceinline__ T result(T r, int) const
{
return r;
}
__host__ __device__ __forceinline__ Max() {}
__host__ __device__ __forceinline__ Max(const Max&) {}
};
///////////////////////////////////////////////////////////
template <typename T, typename S, typename D, class Op>
__global__ void rowsKernel(const PtrStepSz<T> src, D* dst, const Op op)
{
__shared__ S smem[16 * 16];
const int x = blockIdx.x * 16 + threadIdx.x;
S myVal = op.template startValue<S>();
if (x < src.cols)
{
for (int y = threadIdx.y; y < src.rows; y += 16)
{
S srcVal = src(y, x);
myVal = op(myVal, srcVal);
}
}
smem[threadIdx.x * 16 + threadIdx.y] = myVal;
__syncthreads();
volatile S* srow = smem + threadIdx.y * 16;
myVal = srow[threadIdx.x];
device::reduce<16>(srow, myVal, threadIdx.x, op);
if (threadIdx.x == 0)
srow[0] = myVal;
__syncthreads();
if (threadIdx.y == 0 && x < src.cols)
dst[x] = (D) op.result(smem[threadIdx.x * 16], src.rows);
}
template <typename T, typename S, typename D, class Op>
void rowsCaller(PtrStepSz<T> src, D* dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(src.cols, block.x));
Op op;
rowsKernel<T, S, D, Op><<<grid, block, 0, stream>>>(src, dst, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename S, typename D>
void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSz<T> src, D* dst, cudaStream_t stream);
static const func_t funcs[] =
{
rowsCaller<T, S, D, Sum>,
rowsCaller<T, S, D, Avg>,
rowsCaller<T, S, D, Max>,
rowsCaller<T, S, D, Min>
};
funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
}
template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, int, short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<int, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<int, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<int, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
///////////////////////////////////////////////////////////
template <int BLOCK_SIZE, typename T, typename S, typename D, int cn, class Op>
__global__ void colsKernel(const PtrStepSz<typename TypeVec<T, cn>::vec_type> src, typename TypeVec<D, cn>::vec_type* dst, const Op op)
{
typedef typename TypeVec<T, cn>::vec_type src_type;
typedef typename TypeVec<S, cn>::vec_type work_type;
typedef typename TypeVec<D, cn>::vec_type dst_type;
__shared__ S smem[BLOCK_SIZE * cn];
const int y = blockIdx.x;
const src_type* srcRow = src.ptr(y);
work_type myVal = op.template startValue<work_type>();
for (int x = threadIdx.x; x < src.cols; x += BLOCK_SIZE)
myVal = op(myVal, saturate_cast<work_type>(srcRow[x]));
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(myVal), threadIdx.x, detail::Unroll<cn>::op(op));
if (threadIdx.x == 0)
dst[y] = saturate_cast<dst_type>(op.result(myVal, src.cols));
}
template <typename T, typename S, typename D, int cn, class Op> void colsCaller(PtrStepSzb src, void* dst, cudaStream_t stream)
{
const int BLOCK_SIZE = 256;
const dim3 block(BLOCK_SIZE);
const dim3 grid(src.rows);
Op op;
colsKernel<BLOCK_SIZE, T, S, D, cn, Op><<<grid, block, 0, stream>>>((PtrStepSz<typename TypeVec<T, cn>::vec_type>) src, (typename TypeVec<D, cn>::vec_type*) dst, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename S, typename D> void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, void* dst, cudaStream_t stream);
static const func_t funcs[5][4] =
{
{0,0,0,0},
{colsCaller<T, S, D, 1, Sum>, colsCaller<T, S, D, 1, Avg>, colsCaller<T, S, D, 1, Max>, colsCaller<T, S, D, 1, Min>},
{colsCaller<T, S, D, 2, Sum>, colsCaller<T, S, D, 2, Avg>, colsCaller<T, S, D, 2, Max>, colsCaller<T, S, D, 2, Min>},
{colsCaller<T, S, D, 3, Sum>, colsCaller<T, S, D, 3, Avg>, colsCaller<T, S, D, 3, Max>, colsCaller<T, S, D, 3, Min>},
{colsCaller<T, S, D, 4, Sum>, colsCaller<T, S, D, 4, Avg>, colsCaller<T, S, D, 4, Max>, colsCaller<T, S, D, 4, Min>},
};
funcs[cn][op](src, dst, stream);
}
template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, int, short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<int, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<int, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<int, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,511 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace cuda { namespace device
{
namespace split_merge
{
template <typename T, size_t elem_size = sizeof(T)>
struct TypeTraits
{
typedef T type;
typedef T type2;
typedef T type3;
typedef T type4;
};
template <typename T>
struct TypeTraits<T, 1>
{
typedef char type;
typedef char2 type2;
typedef char3 type3;
typedef char4 type4;
};
template <typename T>
struct TypeTraits<T, 2>
{
typedef short type;
typedef short2 type2;
typedef short3 type3;
typedef short4 type4;
};
template <typename T>
struct TypeTraits<T, 4>
{
typedef int type;
typedef int2 type2;
typedef int3 type3;
typedef int4 type4;
};
template <typename T>
struct TypeTraits<T, 8>
{
typedef double type;
typedef double2 type2;
//typedef double3 type3;
//typedef double4 type3;
};
typedef void (*MergeFunction)(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream);
typedef void (*SplitFunction)(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream);
//------------------------------------------------------------
// Merge
template <typename T>
__global__ void mergeC2_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type2 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_y[x] = dst_elem;
}
}
template <typename T>
__global__ void mergeC3_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type3 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
const T* src2_y = (const T*)(src2 + y * src2_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_elem.z = src2_y[x];
dst_y[x] = dst_elem;
}
}
template <>
__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src0_y = (const double*)(src0 + y * src0_step);
const double* src1_y = (const double*)(src1 + y * src1_step);
const double* src2_y = (const double*)(src2 + y * src2_step);
double* dst_y = (double*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_y[3 * x] = src0_y[x];
dst_y[3 * x + 1] = src1_y[x];
dst_y[3 * x + 2] = src2_y[x];
}
}
template <typename T>
__global__ void mergeC4_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type4 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
const T* src2_y = (const T*)(src2 + y * src2_step);
const T* src3_y = (const T*)(src3 + y * src3_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_elem.z = src2_y[x];
dst_elem.w = src3_y[x];
dst_y[x] = dst_elem;
}
}
template <>
__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src0_y = (const double*)(src0 + y * src0_step);
const double* src1_y = (const double*)(src1 + y * src1_step);
const double* src2_y = (const double*)(src2 + y * src2_step);
const double* src3_y = (const double*)(src3 + y * src3_step);
double2* dst_y = (double2*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
}
}
template <typename T>
static void mergeC2_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC2_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void mergeC3_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC3_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void mergeC4_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC4_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
src[3].data, src[3].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void merge(const PtrStepSzb* src, PtrStepSzb& dst,
int total_channels, size_t elem_size,
const cudaStream_t& stream)
{
static MergeFunction merge_func_tbl[] =
{
mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
};
size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
MergeFunction merge_func = merge_func_tbl[merge_func_id];
if (merge_func == 0)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
merge_func(src, dst, stream);
}
//------------------------------------------------------------
// Split
template <typename T>
__global__ void splitC2_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step)
{
typedef typename TypeTraits<T>::type2 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
}
}
template <typename T>
__global__ void splitC3_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step)
{
typedef typename TypeTraits<T>::type3 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
T* dst2_y = (T*)(dst2 + y * dst2_step);
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
dst2_y[x] = src_elem.z;
}
}
template <>
__global__ void splitC3_<double>(
const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src_y = (const double*)(src + y * src_step);
double* dst0_y = (double*)(dst0 + y * dst0_step);
double* dst1_y = (double*)(dst1 + y * dst1_step);
double* dst2_y = (double*)(dst2 + y * dst2_step);
if (x < cols && y < rows)
{
dst0_y[x] = src_y[3 * x];
dst1_y[x] = src_y[3 * x + 1];
dst2_y[x] = src_y[3 * x + 2];
}
}
template <typename T>
__global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step)
{
typedef typename TypeTraits<T>::type4 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
T* dst2_y = (T*)(dst2 + y * dst2_step);
T* dst3_y = (T*)(dst3 + y * dst3_step);
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
dst2_y[x] = src_elem.z;
dst3_y[x] = src_elem.w;
}
}
template <>
__global__ void splitC4_<double>(
const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double2* src_y = (const double2*)(src + y * src_step);
double* dst0_y = (double*)(dst0 + y * dst0_step);
double* dst1_y = (double*)(dst1 + y * dst1_step);
double* dst2_y = (double*)(dst2 + y * dst2_step);
double* dst3_y = (double*)(dst3 + y * dst3_step);
if (x < cols && y < rows)
{
double2 src_elem1 = src_y[2 * x];
double2 src_elem2 = src_y[2 * x + 1];
dst0_y[x] = src_elem1.x;
dst1_y[x] = src_elem1.y;
dst2_y[x] = src_elem2.x;
dst3_y[x] = src_elem2.y;
}
}
template <typename T>
static void splitC2_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC2_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void splitC3_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC3_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void splitC4_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC4_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step,
dst[3].data, dst[3].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void split(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
{
static SplitFunction split_func_tbl[] =
{
splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
};
size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
SplitFunction split_func = split_func_tbl[split_func_id];
if (split_func == 0)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
split_func(src, dst, stream);
}
} // namespace split_merge
}}} // namespace cv { namespace cuda { namespace cudev
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,185 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
struct VSub4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vsub4(a, b);
}
__host__ __device__ __forceinline__ VSub4() {}
__host__ __device__ __forceinline__ VSub4(const VSub4&) {}
};
struct VSub2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vsub2(a, b);
}
__host__ __device__ __forceinline__ VSub2() {}
__host__ __device__ __forceinline__ VSub2(const VSub2&) {}
};
template <typename T, typename D> struct SubMat : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(a - b);
}
__host__ __device__ __forceinline__ SubMat() {}
__host__ __device__ __forceinline__ SubMat(const SubMat&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T, typename D> struct TransformFunctorTraits< arithm::SubMat<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
void subMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VSub4(), WithOutMask(), stream);
}
void subMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VSub2(), WithOutMask(), stream);
}
template <typename T, typename D>
void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), WithOutMask(), stream);
}
template void subMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,149 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
template <typename T, typename S, typename D> struct SubScalar : unary_function<T, D>
{
S val;
int scale;
__host__ SubScalar(S val_, int scale_) : val(val_), scale(scale_) {}
__device__ __forceinline__ D operator ()(T a) const
{
return saturate_cast<D>(scale * (a - val));
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::SubScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
SubScalar<T, S, D> op(static_cast<S>(val), inv ? -1 : 1);
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<ushort, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<ushort, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<short, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<short, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<int, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<int, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<int, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<float, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<float, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<double, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,381 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "unroll_detail.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace sum
{
__device__ unsigned int blocks_finished = 0;
template <typename R, int cn> struct AtomicAdd;
template <typename R> struct AtomicAdd<R, 1>
{
static __device__ void run(R* ptr, R val)
{
Emulation::glob::atomicAdd(ptr, val);
}
};
template <typename R> struct AtomicAdd<R, 2>
{
typedef typename TypeVec<R, 2>::vec_type val_type;
static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
}
};
template <typename R> struct AtomicAdd<R, 3>
{
typedef typename TypeVec<R, 3>::vec_type val_type;
static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
Emulation::glob::atomicAdd(ptr + 2, val.z);
}
};
template <typename R> struct AtomicAdd<R, 4>
{
typedef typename TypeVec<R, 4>::vec_type val_type;
static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
Emulation::glob::atomicAdd(ptr + 2, val.z);
Emulation::glob::atomicAdd(ptr + 3, val.w);
}
};
template <int BLOCK_SIZE, typename R, int cn>
struct GlobalReduce
{
typedef typename TypeVec<R, cn>::vec_type result_type;
static __device__ void run(result_type& sum, result_type* result, int tid, int bid, R* smem)
{
#if __CUDA_ARCH__ >= 200
if (tid == 0)
AtomicAdd<R, cn>::run((R*) result, sum);
#else
__shared__ bool is_last;
if (tid == 0)
{
result[bid] = sum;
__threadfence();
unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
if (tid == 0)
{
result[0] = sum;
blocks_finished = 0;
}
}
#endif
}
};
template <int BLOCK_SIZE, typename src_type, typename result_type, class Mask, class Op>
__global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Mask mask, const Op op, const int twidth, const int theight)
{
typedef typename VecTraits<src_type>::elem_type T;
typedef typename VecTraits<result_type>::elem_type R;
const int cn = VecTraits<src_type>::cn;
__shared__ R smem[BLOCK_SIZE * cn];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
result_type sum = VecTraits<result_type>::all(0);
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const src_type* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
if (mask(y, x))
{
const src_type srcVal = ptr[x];
sum = sum + op(saturate_cast<result_type>(srcVal));
}
}
}
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
GlobalReduce<BLOCK_SIZE, R, cn>::run(sum, result, tid, bid, smem);
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
bufcols = grid.x * grid.y * sizeof(double) * cn;
bufrows = 1;
}
template <typename T, typename R, int cn, template <typename> class Op>
void caller(PtrStepSzb src_, void* buf_, double* out, PtrStepSzb mask)
{
typedef typename TypeVec<T, cn>::vec_type src_type;
typedef typename TypeVec<R, cn>::vec_type result_type;
PtrStepSz<src_type> src(src_);
result_type* buf = (result_type*) buf_;
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
Op<result_type> op;
if (mask.data)
kernel<threads_x * threads_y><<<grid, block>>>(src, buf, SingleMask(mask), op, twidth, theight);
else
kernel<threads_x * threads_y><<<grid, block>>>(src, buf, WithOutMask(), op, twidth, theight);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall( cudaMemcpy(&result, buf, sizeof(result_type), cudaMemcpyDeviceToHost) );
out[0] = result[0];
out[1] = result[1];
out[2] = result[2];
out[3] = result[3];
}
template <typename T> struct SumType;
template <> struct SumType<uchar> { typedef unsigned int R; };
template <> struct SumType<schar> { typedef int R; };
template <> struct SumType<ushort> { typedef unsigned int R; };
template <> struct SumType<short> { typedef int R; };
template <> struct SumType<int> { typedef int R; };
template <> struct SumType<float> { typedef float R; };
template <> struct SumType<double> { typedef double R; };
template <typename T, int cn>
void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
{
typedef typename SumType<T>::R R;
caller<T, R, cn, identity>(src, buf, out, mask);
}
template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template <typename T, int cn>
void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
{
typedef typename SumType<T>::R R;
caller<T, R, cn, abs_func>(src, buf, out, mask);
}
template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template <typename T> struct Sqr : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(T x) const
{
return x * x;
}
};
template <typename T, int cn>
void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
{
caller<T, double, cn, Sqr>(src, buf, out, mask);
}
template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,114 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< thresh_binary_inv_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< thresh_trunc_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< thresh_to_zero_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <template <typename> class Op, typename T>
void threshold_caller(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream)
{
Op<T> op(thresh, maxVal);
device::transform(src, dst, op, WithOutMask(), stream);
}
template <typename T>
void threshold(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream);
static const caller_t callers[] =
{
threshold_caller<thresh_binary_func, T>,
threshold_caller<thresh_binary_inv_func, T>,
threshold_caller<thresh_trunc_func, T>,
threshold_caller<thresh_to_zero_func, T>,
threshold_caller<thresh_to_zero_inv_func, T>
};
callers[type]((PtrStepSz<T>) src, (PtrStepSz<T>) dst, static_cast<T>(thresh), static_cast<T>(maxVal), stream);
}
template void threshold<uchar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<schar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<ushort>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<short>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<int>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<float>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<double>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,122 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
const int TRANSPOSE_TILE_DIM = 16;
const int TRANSPOSE_BLOCK_ROWS = 16;
template <typename T>
__global__ void transposeKernel(const PtrStepSz<T> src, PtrStep<T> dst)
{
__shared__ T tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1];
int blockIdx_x, blockIdx_y;
// do diagonal reordering
if (gridDim.x == gridDim.y)
{
blockIdx_y = blockIdx.x;
blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
}
else
{
int bid = blockIdx.x + gridDim.x * blockIdx.y;
blockIdx_y = bid % gridDim.y;
blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
}
int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
if (xIndex < src.cols)
{
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
{
if (yIndex + i < src.rows)
{
tile[threadIdx.y + i][threadIdx.x] = src(yIndex + i, xIndex);
}
}
}
__syncthreads();
xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
if (xIndex < src.rows)
{
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
{
if (yIndex + i < src.cols)
{
dst(yIndex + i, xIndex) = tile[threadIdx.x][threadIdx.y + i];
}
}
}
}
template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
transposeKernel<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void transpose<int>(PtrStepSz<int> src, PtrStepSz<int> dst, cudaStream_t stream);
template void transpose<double>(PtrStepSz<double> src, PtrStepSz<double> dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,135 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __UNROLL_DETAIL_HPP__
#define __UNROLL_DETAIL_HPP__
#include <thrust/tuple.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
namespace detail
{
template <int cn> struct Unroll;
template <> struct Unroll<1>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ volatile R* smem_tuple(R* smem)
{
return smem;
}
template <typename R>
static __device__ __forceinline__ R& tie(R& val)
{
return val;
}
template <class Op>
static __device__ __forceinline__ const Op& op(const Op& op)
{
return op;
}
};
template <> struct Unroll<2>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE);
}
template <typename R>
static __device__ __forceinline__ thrust::tuple<typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&> tie(R& val)
{
return thrust::tie(val.x, val.y);
}
template <class Op>
static __device__ __forceinline__ const thrust::tuple<Op, Op> op(const Op& op)
{
return thrust::make_tuple(op, op);
}
};
template <> struct Unroll<3>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
}
template <typename R>
static __device__ __forceinline__ thrust::tuple<typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&> tie(R& val)
{
return thrust::tie(val.x, val.y, val.z);
}
template <class Op>
static __device__ __forceinline__ const thrust::tuple<Op, Op, Op> op(const Op& op)
{
return thrust::make_tuple(op, op, op);
}
};
template <> struct Unroll<4>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
}
template <typename R>
static __device__ __forceinline__ thrust::tuple<typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&> tie(R& val)
{
return thrust::tie(val.x, val.y, val.z, val.w);
}
template <class Op>
static __device__ __forceinline__ const thrust::tuple<Op, Op, Op, Op> op(const Op& op)
{
return thrust::make_tuple(op, op, op, op);
}
};
}
#endif // __UNROLL_DETAIL_HPP__