From 54337fd51337f434292d705176b6a7165c655a61 Mon Sep 17 00:00:00 2001 From: Anton Obukhov Date: Sun, 13 Nov 2011 20:52:48 +0000 Subject: [PATCH] [+] CUDA path for NCVImagePyramid --- modules/gpu/src/nvidia/core/NCV.hpp | 1 + modules/gpu/src/nvidia/core/NCVAlg.hpp | 9 + .../src/nvidia/core/NCVPixelOperations.hpp | 162 ++++++++-------- modules/gpu/src/nvidia/core/NCVPyramid.cu | 183 +++++++++++++++--- 4 files changed, 251 insertions(+), 104 deletions(-) diff --git a/modules/gpu/src/nvidia/core/NCV.hpp b/modules/gpu/src/nvidia/core/NCV.hpp index 935ebcfdf..aa3de2747 100644 --- a/modules/gpu/src/nvidia/core/NCV.hpp +++ b/modules/gpu/src/nvidia/core/NCV.hpp @@ -331,6 +331,7 @@ enum NCV_HAAR_XML_LOADING_EXCEPTION, NCV_NOIMPL_HAAR_TILTED_FEATURES, + NCV_NOT_IMPLEMENTED, NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW, diff --git a/modules/gpu/src/nvidia/core/NCVAlg.hpp b/modules/gpu/src/nvidia/core/NCVAlg.hpp index 7c6cebd08..6a14be04d 100644 --- a/modules/gpu/src/nvidia/core/NCVAlg.hpp +++ b/modules/gpu/src/nvidia/core/NCVAlg.hpp @@ -45,6 +45,15 @@ #include "NCV.hpp" +template +static void swap(T &p1, T &p2) +{ + T tmp = p1; + p1 = p2; + p2 = tmp; +} + + template static T divUp(T a, T b) { diff --git a/modules/gpu/src/nvidia/core/NCVPixelOperations.hpp b/modules/gpu/src/nvidia/core/NCVPixelOperations.hpp index 3951a2f43..5096db9e3 100644 --- a/modules/gpu/src/nvidia/core/NCVPixelOperations.hpp +++ b/modules/gpu/src/nvidia/core/NCVPixelOperations.hpp @@ -46,25 +46,25 @@ #include #include "NCV.hpp" -template inline TBase _pixMaxVal(); -template<> static inline Ncv8u _pixMaxVal() {return UCHAR_MAX;} -template<> static inline Ncv16u _pixMaxVal() {return USHRT_MAX;} -template<> static inline Ncv32u _pixMaxVal() {return UINT_MAX;} -template<> static inline Ncv8s _pixMaxVal() {return CHAR_MAX;} -template<> static inline Ncv16s _pixMaxVal() {return SHRT_MAX;} -template<> static inline Ncv32s _pixMaxVal() {return INT_MAX;} -template<> static inline Ncv32f _pixMaxVal() {return FLT_MAX;} -template<> static inline Ncv64f _pixMaxVal() {return DBL_MAX;} +template inline __host__ __device__ TBase _pixMaxVal(); +template<> static inline __host__ __device__ Ncv8u _pixMaxVal() {return UCHAR_MAX;} +template<> static inline __host__ __device__ Ncv16u _pixMaxVal() {return USHRT_MAX;} +template<> static inline __host__ __device__ Ncv32u _pixMaxVal() {return UINT_MAX;} +template<> static inline __host__ __device__ Ncv8s _pixMaxVal() {return CHAR_MAX;} +template<> static inline __host__ __device__ Ncv16s _pixMaxVal() {return SHRT_MAX;} +template<> static inline __host__ __device__ Ncv32s _pixMaxVal() {return INT_MAX;} +template<> static inline __host__ __device__ Ncv32f _pixMaxVal() {return FLT_MAX;} +template<> static inline __host__ __device__ Ncv64f _pixMaxVal() {return DBL_MAX;} -template inline TBase _pixMinVal(); -template<> static inline Ncv8u _pixMinVal() {return 0;} -template<> static inline Ncv16u _pixMinVal() {return 0;} -template<> static inline Ncv32u _pixMinVal() {return 0;} -template<> static inline Ncv8s _pixMinVal() {return CHAR_MIN;} -template<> static inline Ncv16s _pixMinVal() {return SHRT_MIN;} -template<> static inline Ncv32s _pixMinVal() {return INT_MIN;} -template<> static inline Ncv32f _pixMinVal() {return FLT_MIN;} -template<> static inline Ncv64f _pixMinVal() {return DBL_MIN;} +template inline __host__ __device__ TBase _pixMinVal(); +template<> static inline __host__ __device__ Ncv8u _pixMinVal() {return 0;} +template<> static inline __host__ __device__ Ncv16u _pixMinVal() {return 0;} +template<> static inline __host__ __device__ Ncv32u _pixMinVal() {return 0;} +template<> static inline __host__ __device__ Ncv8s _pixMinVal() {return CHAR_MIN;} +template<> static inline __host__ __device__ Ncv16s _pixMinVal() {return SHRT_MIN;} +template<> static inline __host__ __device__ Ncv32s _pixMinVal() {return INT_MIN;} +template<> static inline __host__ __device__ Ncv32f _pixMinVal() {return FLT_MIN;} +template<> static inline __host__ __device__ Ncv64f _pixMinVal() {return DBL_MIN;} template struct TConvVec2Base; template<> struct TConvVec2Base {typedef Ncv8u TBase;}; @@ -103,55 +103,55 @@ template<> struct TConvBase2Vec {typedef double3 TVec;}; template<> struct TConvBase2Vec {typedef double4 TVec;}; //TODO: consider using CUDA intrinsics to avoid branching -template static inline void _TDemoteClampZ(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a);}; -template static inline void _TDemoteClampZ(Tin &a, Ncv16u &out) {out = (Ncv16u)CLAMP(a, 0, USHRT_MAX);} -template static inline void _TDemoteClampZ(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a, 0, UINT_MAX);} -template static inline void _TDemoteClampZ(Tin &a, Ncv32f &out) {out = (Ncv32f)a;} +template static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a);}; +template static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv16u &out) {out = (Ncv16u)CLAMP(a, 0, USHRT_MAX);} +template static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a, 0, UINT_MAX);} +template static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv32f &out) {out = (Ncv32f)a;} //TODO: consider using CUDA intrinsics to avoid branching -template static inline void _TDemoteClampNN(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a+0.5f);} -template static inline void _TDemoteClampNN(Tin &a, Ncv16u &out) {out = (Ncv16u)CLAMP(a+0.5f, 0, USHRT_MAX);} -template static inline void _TDemoteClampNN(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a+0.5f, 0, UINT_MAX);} -template static inline void _TDemoteClampNN(Tin &a, Ncv32f &out) {out = (Ncv32f)a;} +template static inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a+0.5f);} +template static inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv16u &out) {out = (Ncv16u)CLAMP(a+0.5f, 0, USHRT_MAX);} +template static inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a+0.5f, 0, UINT_MAX);} +template static inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv32f &out) {out = (Ncv32f)a;} template inline Tout _pixMakeZero(); -template<> static inline uchar1 _pixMakeZero() {return make_uchar1(0);} -template<> static inline uchar3 _pixMakeZero() {return make_uchar3(0,0,0);} -template<> static inline uchar4 _pixMakeZero() {return make_uchar4(0,0,0,0);} -template<> static inline ushort1 _pixMakeZero() {return make_ushort1(0);} -template<> static inline ushort3 _pixMakeZero() {return make_ushort3(0,0,0);} -template<> static inline ushort4 _pixMakeZero() {return make_ushort4(0,0,0,0);} -template<> static inline uint1 _pixMakeZero() {return make_uint1(0);} -template<> static inline uint3 _pixMakeZero() {return make_uint3(0,0,0);} -template<> static inline uint4 _pixMakeZero() {return make_uint4(0,0,0,0);} -template<> static inline float1 _pixMakeZero() {return make_float1(0.f);} -template<> static inline float3 _pixMakeZero() {return make_float3(0.f,0.f,0.f);} -template<> static inline float4 _pixMakeZero() {return make_float4(0.f,0.f,0.f,0.f);} -template<> static inline double1 _pixMakeZero() {return make_double1(0.);} -template<> static inline double3 _pixMakeZero() {return make_double3(0.,0.,0.);} -template<> static inline double4 _pixMakeZero() {return make_double4(0.,0.,0.,0.);} +template<> static inline __host__ __device__ uchar1 _pixMakeZero() {return make_uchar1(0);} +template<> static inline __host__ __device__ uchar3 _pixMakeZero() {return make_uchar3(0,0,0);} +template<> static inline __host__ __device__ uchar4 _pixMakeZero() {return make_uchar4(0,0,0,0);} +template<> static inline __host__ __device__ ushort1 _pixMakeZero() {return make_ushort1(0);} +template<> static inline __host__ __device__ ushort3 _pixMakeZero() {return make_ushort3(0,0,0);} +template<> static inline __host__ __device__ ushort4 _pixMakeZero() {return make_ushort4(0,0,0,0);} +template<> static inline __host__ __device__ uint1 _pixMakeZero() {return make_uint1(0);} +template<> static inline __host__ __device__ uint3 _pixMakeZero() {return make_uint3(0,0,0);} +template<> static inline __host__ __device__ uint4 _pixMakeZero() {return make_uint4(0,0,0,0);} +template<> static inline __host__ __device__ float1 _pixMakeZero() {return make_float1(0.f);} +template<> static inline __host__ __device__ float3 _pixMakeZero() {return make_float3(0.f,0.f,0.f);} +template<> static inline __host__ __device__ float4 _pixMakeZero() {return make_float4(0.f,0.f,0.f,0.f);} +template<> static inline __host__ __device__ double1 _pixMakeZero() {return make_double1(0.);} +template<> static inline __host__ __device__ double3 _pixMakeZero() {return make_double3(0.,0.,0.);} +template<> static inline __host__ __device__ double4 _pixMakeZero() {return make_double4(0.,0.,0.,0.);} -static inline uchar1 _pixMake(Ncv8u x) {return make_uchar1(x);} -static inline uchar3 _pixMake(Ncv8u x, Ncv8u y, Ncv8u z) {return make_uchar3(x,y,z);} -static inline uchar4 _pixMake(Ncv8u x, Ncv8u y, Ncv8u z, Ncv8u w) {return make_uchar4(x,y,z,w);} -static inline ushort1 _pixMake(Ncv16u x) {return make_ushort1(x);} -static inline ushort3 _pixMake(Ncv16u x, Ncv16u y, Ncv16u z) {return make_ushort3(x,y,z);} -static inline ushort4 _pixMake(Ncv16u x, Ncv16u y, Ncv16u z, Ncv16u w) {return make_ushort4(x,y,z,w);} -static inline uint1 _pixMake(Ncv32u x) {return make_uint1(x);} -static inline uint3 _pixMake(Ncv32u x, Ncv32u y, Ncv32u z) {return make_uint3(x,y,z);} -static inline uint4 _pixMake(Ncv32u x, Ncv32u y, Ncv32u z, Ncv32u w) {return make_uint4(x,y,z,w);} -static inline float1 _pixMake(Ncv32f x) {return make_float1(x);} -static inline float3 _pixMake(Ncv32f x, Ncv32f y, Ncv32f z) {return make_float3(x,y,z);} -static inline float4 _pixMake(Ncv32f x, Ncv32f y, Ncv32f z, Ncv32f w) {return make_float4(x,y,z,w);} -static inline double1 _pixMake(Ncv64f x) {return make_double1(x);} -static inline double3 _pixMake(Ncv64f x, Ncv64f y, Ncv64f z) {return make_double3(x,y,z);} -static inline double4 _pixMake(Ncv64f x, Ncv64f y, Ncv64f z, Ncv64f w) {return make_double4(x,y,z,w);} +static inline __host__ __device__ uchar1 _pixMake(Ncv8u x) {return make_uchar1(x);} +static inline __host__ __device__ uchar3 _pixMake(Ncv8u x, Ncv8u y, Ncv8u z) {return make_uchar3(x,y,z);} +static inline __host__ __device__ uchar4 _pixMake(Ncv8u x, Ncv8u y, Ncv8u z, Ncv8u w) {return make_uchar4(x,y,z,w);} +static inline __host__ __device__ ushort1 _pixMake(Ncv16u x) {return make_ushort1(x);} +static inline __host__ __device__ ushort3 _pixMake(Ncv16u x, Ncv16u y, Ncv16u z) {return make_ushort3(x,y,z);} +static inline __host__ __device__ ushort4 _pixMake(Ncv16u x, Ncv16u y, Ncv16u z, Ncv16u w) {return make_ushort4(x,y,z,w);} +static inline __host__ __device__ uint1 _pixMake(Ncv32u x) {return make_uint1(x);} +static inline __host__ __device__ uint3 _pixMake(Ncv32u x, Ncv32u y, Ncv32u z) {return make_uint3(x,y,z);} +static inline __host__ __device__ uint4 _pixMake(Ncv32u x, Ncv32u y, Ncv32u z, Ncv32u w) {return make_uint4(x,y,z,w);} +static inline __host__ __device__ float1 _pixMake(Ncv32f x) {return make_float1(x);} +static inline __host__ __device__ float3 _pixMake(Ncv32f x, Ncv32f y, Ncv32f z) {return make_float3(x,y,z);} +static inline __host__ __device__ float4 _pixMake(Ncv32f x, Ncv32f y, Ncv32f z, Ncv32f w) {return make_float4(x,y,z,w);} +static inline __host__ __device__ double1 _pixMake(Ncv64f x) {return make_double1(x);} +static inline __host__ __device__ double3 _pixMake(Ncv64f x, Ncv64f y, Ncv64f z) {return make_double3(x,y,z);} +static inline __host__ __device__ double4 _pixMake(Ncv64f x, Ncv64f y, Ncv64f z, Ncv64f w) {return make_double4(x,y,z,w);} -template struct __pixDemoteClampZ_CN {static Tout _pixDemoteClampZ_CN(Tin &pix);}; +template struct __pixDemoteClampZ_CN {static __host__ __device__ Tout _pixDemoteClampZ_CN(Tin &pix);}; template struct __pixDemoteClampZ_CN { -static Tout _pixDemoteClampZ_CN(Tin &pix) +static __host__ __device__ Tout _pixDemoteClampZ_CN(Tin &pix) { Tout out; _TDemoteClampZ(pix.x, out.x); @@ -159,7 +159,7 @@ static Tout _pixDemoteClampZ_CN(Tin &pix) }}; template struct __pixDemoteClampZ_CN { -static Tout _pixDemoteClampZ_CN(Tin &pix) +static __host__ __device__ Tout _pixDemoteClampZ_CN(Tin &pix) { Tout out; _TDemoteClampZ(pix.x, out.x); @@ -169,7 +169,7 @@ static Tout _pixDemoteClampZ_CN(Tin &pix) }}; template struct __pixDemoteClampZ_CN { -static Tout _pixDemoteClampZ_CN(Tin &pix) +static __host__ __device__ Tout _pixDemoteClampZ_CN(Tin &pix) { Tout out; _TDemoteClampZ(pix.x, out.x); @@ -179,16 +179,16 @@ static Tout _pixDemoteClampZ_CN(Tin &pix) return out; }}; -template static inline Tout _pixDemoteClampZ(Tin &pix) +template static inline __host__ __device__ Tout _pixDemoteClampZ(Tin &pix) { return __pixDemoteClampZ_CN::_pixDemoteClampZ_CN(pix); } -template struct __pixDemoteClampNN_CN {static Tout _pixDemoteClampNN_CN(Tin &pix);}; +template struct __pixDemoteClampNN_CN {static __host__ __device__ Tout _pixDemoteClampNN_CN(Tin &pix);}; template struct __pixDemoteClampNN_CN { -static Tout _pixDemoteClampNN_CN(Tin &pix) +static __host__ __device__ Tout _pixDemoteClampNN_CN(Tin &pix) { Tout out; _TDemoteClampNN(pix.x, out.x); @@ -196,7 +196,7 @@ static Tout _pixDemoteClampNN_CN(Tin &pix) }}; template struct __pixDemoteClampNN_CN { -static Tout _pixDemoteClampNN_CN(Tin &pix) +static __host__ __device__ Tout _pixDemoteClampNN_CN(Tin &pix) { Tout out; _TDemoteClampNN(pix.x, out.x); @@ -206,7 +206,7 @@ static Tout _pixDemoteClampNN_CN(Tin &pix) }}; template struct __pixDemoteClampNN_CN { -static Tout _pixDemoteClampNN_CN(Tin &pix) +static __host__ __device__ Tout _pixDemoteClampNN_CN(Tin &pix) { Tout out; _TDemoteClampNN(pix.x, out.x); @@ -216,16 +216,16 @@ static Tout _pixDemoteClampNN_CN(Tin &pix) return out; }}; -template static inline Tout _pixDemoteClampNN(Tin &pix) +template static inline __host__ __device__ Tout _pixDemoteClampNN(Tin &pix) { return __pixDemoteClampNN_CN::_pixDemoteClampNN_CN(pix); } -template struct __pixScale_CN {static Tout _pixScale_CN(Tin &pix, Tw w);}; +template struct __pixScale_CN {static __host__ __device__ Tout _pixScale_CN(Tin &pix, Tw w);}; template struct __pixScale_CN { -static Tout _pixScale_CN(Tin &pix, Tw w) +static __host__ __device__ Tout _pixScale_CN(Tin &pix, Tw w) { Tout out; typedef typename TConvVec2Base::TBase TBout; @@ -234,7 +234,7 @@ static Tout _pixScale_CN(Tin &pix, Tw w) }}; template struct __pixScale_CN { -static Tout _pixScale_CN(Tin &pix, Tw w) +static __host__ __device__ Tout _pixScale_CN(Tin &pix, Tw w) { Tout out; typedef typename TConvVec2Base::TBase TBout; @@ -245,7 +245,7 @@ static Tout _pixScale_CN(Tin &pix, Tw w) }}; template struct __pixScale_CN { -static Tout _pixScale_CN(Tin &pix, Tw w) +static __host__ __device__ Tout _pixScale_CN(Tin &pix, Tw w) { Tout out; typedef typename TConvVec2Base::TBase TBout; @@ -256,16 +256,16 @@ static Tout _pixScale_CN(Tin &pix, Tw w) return out; }}; -template static Tout _pixScale(Tin &pix, Tw w) +template static __host__ __device__ Tout _pixScale(Tin &pix, Tw w) { return __pixScale_CN::_pixScale_CN(pix, w); } -template struct __pixAdd_CN {static Tout _pixAdd_CN(Tout &pix1, Tin &pix2);}; +template struct __pixAdd_CN {static __host__ __device__ Tout _pixAdd_CN(Tout &pix1, Tin &pix2);}; template struct __pixAdd_CN { -static Tout _pixAdd_CN(Tout &pix1, Tin &pix2) +static __host__ __device__ Tout _pixAdd_CN(Tout &pix1, Tin &pix2) { Tout out; out.x = pix1.x + pix2.x; @@ -273,7 +273,7 @@ static Tout _pixAdd_CN(Tout &pix1, Tin &pix2) }}; template struct __pixAdd_CN { -static Tout _pixAdd_CN(Tout &pix1, Tin &pix2) +static __host__ __device__ Tout _pixAdd_CN(Tout &pix1, Tin &pix2) { Tout out; out.x = pix1.x + pix2.x; @@ -283,7 +283,7 @@ static Tout _pixAdd_CN(Tout &pix1, Tin &pix2) }}; template struct __pixAdd_CN { -static Tout _pixAdd_CN(Tout &pix1, Tin &pix2) +static __host__ __device__ Tout _pixAdd_CN(Tout &pix1, Tin &pix2) { Tout out; out.x = pix1.x + pix2.x; @@ -293,33 +293,33 @@ static Tout _pixAdd_CN(Tout &pix1, Tin &pix2) return out; }}; -template static Tout _pixAdd(Tout &pix1, Tin &pix2) +template static __host__ __device__ Tout _pixAdd(Tout &pix1, Tin &pix2) { return __pixAdd_CN::_pixAdd_CN(pix1, pix2); } -template struct __pixDist_CN {static Tout _pixDist_CN(Tin &pix1, Tin &pix2);}; +template struct __pixDist_CN {static __host__ __device__ Tout _pixDist_CN(Tin &pix1, Tin &pix2);}; template struct __pixDist_CN { -static Tout _pixDist_CN(Tin &pix1, Tin &pix2) +static __host__ __device__ Tout _pixDist_CN(Tin &pix1, Tin &pix2) { return Tout(SQR(pix1.x - pix2.x)); }}; template struct __pixDist_CN { -static Tout _pixDist_CN(Tin &pix1, Tin &pix2) +static __host__ __device__ Tout _pixDist_CN(Tin &pix1, Tin &pix2) { return Tout(SQR(pix1.x - pix2.x) + SQR(pix1.y - pix2.y) + SQR(pix1.z - pix2.z)); }}; template struct __pixDist_CN { -static Tout _pixDist_CN(Tin &pix1, Tin &pix2) +static __host__ __device__ Tout _pixDist_CN(Tin &pix1, Tin &pix2) { return Tout(SQR(pix1.x - pix2.x) + SQR(pix1.y - pix2.y) + SQR(pix1.z - pix2.z) + SQR(pix1.w - pix2.w)); }}; -template static Tout _pixDist(Tin &pix1, Tin &pix2) +template static __host__ __device__ Tout _pixDist(Tin &pix1, Tin &pix2) { return __pixDist_CN::_pixDist_CN(pix1, pix2); } diff --git a/modules/gpu/src/nvidia/core/NCVPyramid.cu b/modules/gpu/src/nvidia/core/NCVPyramid.cu index 463178362..5a2367798 100644 --- a/modules/gpu/src/nvidia/core/NCVPyramid.cu +++ b/modules/gpu/src/nvidia/core/NCVPyramid.cu @@ -43,15 +43,16 @@ #include #include #include "NCV.hpp" +#include "NCVAlg.hpp" #include "NCVPyramid.hpp" #include "NCVPixelOperations.hpp" -#ifdef _WIN32 +#ifdef _WIN32 -template struct __average4_CN {static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11);}; +template struct __average4_CN {static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11);}; template struct __average4_CN { -static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11) +static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11) { T out; out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4; @@ -59,7 +60,7 @@ static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11) }}; template<> struct __average4_CN { -static float1 _average4_CN(const float1 &p00, const float1 &p01, const float1 &p10, const float1 &p11) +static __host__ __device__ float1 _average4_CN(const float1 &p00, const float1 &p01, const float1 &p10, const float1 &p11) { float1 out; out.x = (p00.x + p01.x + p10.x + p11.x) / 4; @@ -67,7 +68,7 @@ static float1 _average4_CN(const float1 &p00, const float1 &p01, const float1 &p }}; template<> struct __average4_CN { -static double1 _average4_CN(const double1 &p00, const double1 &p01, const double1 &p10, const double1 &p11) +static __host__ __device__ double1 _average4_CN(const double1 &p00, const double1 &p01, const double1 &p10, const double1 &p11) { double1 out; out.x = (p00.x + p01.x + p10.x + p11.x) / 4; @@ -75,7 +76,7 @@ static double1 _average4_CN(const double1 &p00, const double1 &p01, const double }}; template struct __average4_CN { -static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11) +static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11) { T out; out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4; @@ -85,7 +86,7 @@ static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11) }}; template<> struct __average4_CN { -static float3 _average4_CN(const float3 &p00, const float3 &p01, const float3 &p10, const float3 &p11) +static __host__ __device__ float3 _average4_CN(const float3 &p00, const float3 &p01, const float3 &p10, const float3 &p11) { float3 out; out.x = (p00.x + p01.x + p10.x + p11.x) / 4; @@ -95,7 +96,7 @@ static float3 _average4_CN(const float3 &p00, const float3 &p01, const float3 &p }}; template<> struct __average4_CN { -static double3 _average4_CN(const double3 &p00, const double3 &p01, const double3 &p10, const double3 &p11) +static __host__ __device__ double3 _average4_CN(const double3 &p00, const double3 &p01, const double3 &p10, const double3 &p11) { double3 out; out.x = (p00.x + p01.x + p10.x + p11.x) / 4; @@ -105,7 +106,7 @@ static double3 _average4_CN(const double3 &p00, const double3 &p01, const double }}; template struct __average4_CN { -static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11) +static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11) { T out; out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4; @@ -116,7 +117,7 @@ static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11) }}; template<> struct __average4_CN { -static float4 _average4_CN(const float4 &p00, const float4 &p01, const float4 &p10, const float4 &p11) +static __host__ __device__ float4 _average4_CN(const float4 &p00, const float4 &p01, const float4 &p10, const float4 &p11) { float4 out; out.x = (p00.x + p01.x + p10.x + p11.x) / 4; @@ -127,7 +128,7 @@ static float4 _average4_CN(const float4 &p00, const float4 &p01, const float4 &p }}; template<> struct __average4_CN { -static double4 _average4_CN(const double4 &p00, const double4 &p01, const double4 &p10, const double4 &p11) +static __host__ __device__ double4 _average4_CN(const double4 &p00, const double4 &p01, const double4 &p10, const double4 &p11) { double4 out; out.x = (p00.x + p01.x + p10.x + p11.x) / 4; @@ -137,23 +138,23 @@ static double4 _average4_CN(const double4 &p00, const double4 &p01, const double return out; }}; -template static T _average4(const T &p00, const T &p01, const T &p10, const T &p11) +template static __host__ __device__ T _average4(const T &p00, const T &p01, const T &p10, const T &p11) { return __average4_CN::_average4_CN(p00, p01, p10, p11); } -template struct __lerp_CN {static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d);}; +template struct __lerp_CN {static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d);}; template struct __lerp_CN { -static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d) +static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d) { typedef typename TConvVec2Base::TBase TB; return _pixMake(TB(b.x * d + a.x * (1 - d))); }}; template struct __lerp_CN { -static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d) +static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d) { typedef typename TConvVec2Base::TBase TB; return _pixMake(TB(b.x * d + a.x * (1 - d)), @@ -162,7 +163,7 @@ static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d) }}; template struct __lerp_CN { -static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d) +static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d) { typedef typename TConvVec2Base::TBase TB; return _pixMake(TB(b.x * d + a.x * (1 - d)), @@ -171,7 +172,7 @@ static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d) TB(b.w * d + a.w * (1 - d))); }}; -template static Tout _lerp(const Tin &a, const Tin &b, Ncv32f d) +template static __host__ __device__ Tout _lerp(const Tin &a, const Tin &b, Ncv32f d) { return __lerp_CN::_lerp_CN(a, b, d); } @@ -208,6 +209,74 @@ static T _interpBilinear(const NCVMatrix &refLayer, Ncv32f x, Ncv32f y) } +template +__global__ void kernelDownsampleX2(T *d_src, + Ncv32u srcPitch, + T *d_dst, + Ncv32u dstPitch, + NcvSize32u dstRoi) +{ + Ncv32u i = blockIdx.y * blockDim.y + threadIdx.y; + Ncv32u j = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < dstRoi.height && j < dstRoi.width) + { + T *d_src_line1 = (T *)((Ncv8u *)d_src + (2 * i + 0) * srcPitch); + T *d_src_line2 = (T *)((Ncv8u *)d_src + (2 * i + 1) * srcPitch); + T *d_dst_line = (T *)((Ncv8u *)d_dst + i * dstPitch); + + T p00 = d_src_line1[2*j+0]; + T p01 = d_src_line1[2*j+1]; + T p10 = d_src_line2[2*j+0]; + T p11 = d_src_line2[2*j+1]; + + d_dst_line[j] = _average4(p00, p01, p10, p11); + } +} + + +template +__global__ void kernelInterpolateFrom1(T *d_srcTop, + Ncv32u srcTopPitch, + NcvSize32u szTopRoi, + T *d_dst, + Ncv32u dstPitch, + NcvSize32u dstRoi) +{ + Ncv32u i = blockIdx.y * blockDim.y + threadIdx.y; + Ncv32u j = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < dstRoi.height && j < dstRoi.width) + { + Ncv32f ptTopX = 1.0f * (szTopRoi.width - 1) * j / (dstRoi.width - 1); + Ncv32f ptTopY = 1.0f * (szTopRoi.height - 1) * i / (dstRoi.height - 1); + Ncv32u xl = (Ncv32u)ptTopX; + Ncv32u xh = xl+1; + Ncv32f dx = ptTopX - xl; + Ncv32u yl = (Ncv32u)ptTopY; + Ncv32u yh = yl+1; + Ncv32f dy = ptTopY - yl; + + T *d_src_line1 = (T *)((Ncv8u *)d_srcTop + yl * srcTopPitch); + T *d_src_line2 = (T *)((Ncv8u *)d_srcTop + yh * srcTopPitch); + T *d_dst_line = (T *)((Ncv8u *)d_dst + i * dstPitch); + + T p00, p01, p10, p11; + p00 = d_src_line1[xl]; + p01 = xh < szTopRoi.width ? d_src_line1[xh] : p00; + p10 = yh < szTopRoi.height ? d_src_line2[xl] : p00; + p11 = (xh < szTopRoi.width && yh < szTopRoi.height) ? d_src_line2[xh] : p00; + typedef typename TConvBase2Vec::TVec TVFlt; + TVFlt m_00_01 = _lerp(p00, p01, dx); + TVFlt m_10_11 = _lerp(p10, p11, dx); + TVFlt mixture = _lerp(m_00_01, m_10_11, dy); + T outPix = _pixDemoteClampZ(mixture); + + d_dst_line[j] = outPix; + } +} + + template NCVImagePyramid::NCVImagePyramid(const NCVMatrix &img, Ncv8u numLayers, @@ -215,7 +284,7 @@ NCVImagePyramid::NCVImagePyramid(const NCVMatrix &img, cudaStream_t cuStream) { this->_isInitialized = false; - ncvAssertPrintReturn(img.memType() == alloc.memType(), "NCVImagePyramid_host::ctor error", ); + ncvAssertPrintReturn(img.memType() == alloc.memType(), "NCVImagePyramid::ctor error", ); this->layer0 = &img; NcvSize32u szLastLayer(img.width(), img.height()); @@ -229,6 +298,10 @@ NCVImagePyramid::NCVImagePyramid(const NCVMatrix &img, numLayers = 255; //it will cut-off when any of the dimensions goes 1 } +#ifdef SELF_CHECK_GPU + NCVMemNativeAllocator allocCPU(NCVMemoryTypeHostPinned, 512); +#endif + for (Ncv32u i=0; i<(Ncv32u)numLayers-1; i++) { NcvSize32u szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2); @@ -238,7 +311,7 @@ NCVImagePyramid::NCVImagePyramid(const NCVMatrix &img, } this->pyramid.push_back(new NCVMatrixAlloc(alloc, szCurLayer.width, szCurLayer.height)); - ncvAssertPrintReturn(((NCVMatrixAlloc *)(this->pyramid[i]))->isMemAllocated(), "NCVImagePyramid_host::ctor error", ); + ncvAssertPrintReturn(((NCVMatrixAlloc *)(this->pyramid[i]))->isMemAllocated(), "NCVImagePyramid::ctor error", ); this->nLayers++; //fill in the layer @@ -249,7 +322,37 @@ NCVImagePyramid::NCVImagePyramid(const NCVMatrix &img, if (bDeviceCode) { - //TODO: in cuStream + dim3 bDim(16, 8); + dim3 gDim(divUp(szCurLayer.width, bDim.x), divUp(szCurLayer.height, bDim.y)); + kernelDownsampleX2<<>>(prevLayer->ptr(), + prevLayer->pitch(), + curLayer->ptr(), + curLayer->pitch(), + szCurLayer); + ncvAssertPrintReturn(cudaSuccess == cudaGetLastError(), "NCVImagePyramid::ctor error", ); + +#ifdef SELF_CHECK_GPU + NCVMatrixAlloc h_prevLayer(allocCPU, prevLayer->width(), prevLayer->height()); + ncvAssertPrintReturn(h_prevLayer.isMemAllocated(), "Validation failure in NCVImagePyramid::ctor", ); + NCVMatrixAlloc h_curLayer(allocCPU, curLayer->width(), curLayer->height()); + ncvAssertPrintReturn(h_curLayer.isMemAllocated(), "Validation failure in NCVImagePyramid::ctor", ); + ncvAssertPrintReturn(NCV_SUCCESS == prevLayer->copy2D(h_prevLayer, prevLayer->size(), cuStream), "Validation failure in NCVImagePyramid::ctor", ); + ncvAssertPrintReturn(NCV_SUCCESS == curLayer->copy2D(h_curLayer, curLayer->size(), cuStream), "Validation failure in NCVImagePyramid::ctor", ); + ncvAssertPrintReturn(cudaSuccess == cudaStreamSynchronize(cuStream), "Validation failure in NCVImagePyramid::ctor", ); + for (Ncv32u i=0; i::getLayer(NCVMatrix &outImg, NCV_SET_SKIP_COND(outImg.memType() == NCVMemoryTypeNone); NcvBool bDeviceCode = this->layer0->memType() == NCVMemoryTypeDevice; +#ifdef SELF_CHECK_GPU + NCVMemNativeAllocator allocCPU(NCVMemoryTypeHostPinned, 512); +#endif + NCV_SKIP_COND_BEGIN if (bDeviceCode) { - //TODO: in cuStream + ncvAssertReturn(bUse2Refs == false, NCV_NOT_IMPLEMENTED); + + dim3 bDim(16, 8); + dim3 gDim(divUp(outRoi.width, bDim.x), divUp(outRoi.height, bDim.y)); + kernelInterpolateFrom1<<>>(lastLayer->ptr(), + lastLayer->pitch(), + lastLayer->size(), + outImg.ptr(), + outImg.pitch(), + outRoi); + ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); + +#ifdef SELF_CHECK_GPU + ncvSafeMatAlloc(h_lastLayer, T, allocCPU, lastLayer->width(), lastLayer->height(), NCV_ALLOCATOR_BAD_ALLOC); + ncvSafeMatAlloc(h_outImg, T, allocCPU, outImg.width(), outImg.height(), NCV_ALLOCATOR_BAD_ALLOC); + ncvAssertReturnNcvStat(lastLayer->copy2D(h_lastLayer, lastLayer->size(), cuStream)); + ncvAssertReturnNcvStat(outImg.copy2D(h_outImg, outRoi, cuStream)); + ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); + + for (Ncv32u i=0; iwidth(), lastLayer->height()); + Ncv32f ptTopX = 1.0f * (szTopLayer.width - 1) * j / (outRoi.width - 1); + Ncv32f ptTopY = 1.0f * (szTopLayer.height - 1) * i / (outRoi.height - 1); + T outGold = _interpBilinear(h_lastLayer, ptTopX, ptTopY); + ncvAssertPrintReturn(0 == memcmp(&outGold, &h_outImg.at(j,i), sizeof(T)), "Validation failure in NCVImagePyramid::ctor with kernelInterpolateFrom1", NCV_UNKNOWN_ERROR); + } + } +#endif } else { @@ -395,6 +532,6 @@ template class NCVImagePyramid; template class NCVImagePyramid; template class NCVImagePyramid; template class NCVImagePyramid; -template class NCVImagePyramid; - +template class NCVImagePyramid; + #endif //_WIN32