diff --git a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp new file mode 100644 index 000000000..eb6647dd7 --- /dev/null +++ b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp @@ -0,0 +1,39 @@ + +#if __CUDA_ARCH__ >= 200 + + // for Fermi memory space is detected automatically + template struct ForceGlobLoad + { + __device__ __forceinline__ static void Ld(T* ptr, int offset, T& val) { val = d_ptr[offset]; } + }; + +#else + + + #if defined(_WIN64) || defined(__LP64__) + // 64-bit register modifier for inlined asm + #define _OPENCV_ASM_PTR_ "l" + #else + // 32-bit register modifier for inlined asm + #define _OPENCV_ASM_PTR_ "r" + #endif + + template struct ForceGlobLoad; + + +#define DEFINE_FORCE_GLOB_LOAD(base_type, ptx_type, reg_mod) \ + template <> struct ForceGlobLoad \ + { \ + __device__ __forceinline__ static void Ld(type* ptr, int offset, type& val) \ + { \ + asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : _OPENCV_ASM_PTR_(d_ptr + offset)); \ + } \ + }; + + DEFINE_FORCE_GLOB_LOAD(int, s32, r) + DEFINE_FORCE_GLOB_LOAD(float, f32, f) + + +#undef DEFINE_FORCE_GLOB_LOAD + +#endif