From 87ca607fd4d4b44b22d84b7fdb2c88cec3287dc8 Mon Sep 17 00:00:00 2001 From: Tomoaki Teshima Date: Wed, 3 Aug 2016 16:53:52 +0900 Subject: [PATCH] brush up convertFp16 * raise an error when a wrong bit depth is passed * raise a build error when a wrong depth is specified for cvtScaleHalf_ * remove unnecessary safe check in cvtScaleHalf_ * use intrinsic instead of direct pointer access * update the explanation --- modules/core/include/opencv2/core.hpp | 5 ++-- modules/core/src/convert.cpp | 38 +++++++++++---------------- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp index d94b67877..88b06f391 100644 --- a/modules/core/include/opencv2/core.hpp +++ b/modules/core/include/opencv2/core.hpp @@ -526,8 +526,9 @@ CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst, /** @brief Converts an array to half precision floating number. -convertFp16 converts FP32 to FP16 or FP16 to FP32. The input array has to have type of CV_32F or -CV_16S to represent the bit depth. If the input array is neither of them, it'll do nothing. +This function converts FP32 (single precision floating point) from/to FP16 (half precision floating point). The input array has to have type of CV_32F or +CV_16S to represent the bit depth. If the input array is neither of them, the function will raise an error. +The format of half precision floating point is defined in IEEE 754-2008. @param src input array. @param dst output array. 
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index f6178d2bc..dc974505e 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -4547,20 +4547,7 @@ static short convertFp16SW(float fp32) // template for FP16 HW conversion function template static void -cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size) -{ - sstep /= sizeof(src[0]); - dstep /= sizeof(dst[0]); - - for( ; size.height--; src += sstep, dst += dstep ) - { - int x = 0; - - for ( ; x < size.width; x++ ) - { - } - } -} +cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size); template<> void cvtScaleHalf_( const float* src, size_t sstep, short* dst, size_t dstep, Size size) @@ -4574,23 +4561,25 @@ cvtScaleHalf_( const float* src, size_t sstep, short* dst, size_t { int x = 0; - if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 ) +#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386) + if ( ( (intptr_t)dst & 0xf ) == 0 ) +#endif { #if CV_FP16 for ( ; x <= size.width - 4; x += 4) { #if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386) - __m128 v_src = _mm_load_ps(src + x); + __m128 v_src = _mm_loadu_ps(src + x); __m128i v_dst = _mm_cvtps_ph(v_src, 0); _mm_storel_epi64((__m128i *)(dst + x), v_dst); #elif defined __GNUC__ && (defined __arm__ || defined __aarch64__) - float32x4_t v_src = *(float32x4_t*)(src + x); + float32x4_t v_src = vld1q_f32(src + x); float16x4_t v_dst = vcvt_f16_f32(v_src); - *(float16x4_t*)(dst + x) = v_dst; + vst1_f16((float16_t*)(dst + x), v_dst); #else #error "Configuration error" #endif @@ -4628,7 +4617,9 @@ cvtScaleHalf_( const short* src, size_t sstep, float* dst, size_t { int x = 0; - if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 && checkHardwareSupport(CV_CPU_FP16) ) +#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386) + if ( ( (intptr_t)src & 0xf ) == 0 ) +#endif { #if 
CV_FP16 for ( ; x <= size.width - 4; x += 4) @@ -4638,13 +4629,13 @@ cvtScaleHalf_( const short* src, size_t sstep, float* dst, size_t __m128 v_dst = _mm_cvtph_ps(v_src); - _mm_store_ps((dst + x), v_dst); + _mm_storeu_ps(dst + x, v_dst); #elif defined __GNUC__ && (defined __arm__ || defined __aarch64__) - float16x4_t v_src = *(float16x4_t*)(src + x); + float16x4_t v_src = vld1_f16((float16_t*)(src + x)); float32x4_t v_dst = vcvt_f32_f16(v_src); - *(float32x4_t*)(dst + x) = v_dst; + vst1q_f32(dst + x, v_dst); #else #error "Configuration error" #endif @@ -4761,7 +4752,7 @@ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, s static void cvtScaleHalf##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ dtype* dst, size_t dstep, Size size, double*) \ { \ - cvtScaleHalf##_(src, sstep, dst, dstep, size); \ + cvtScaleHalf_(src, sstep, dst, dstep, size); \ } #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \ @@ -5153,6 +5144,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst) ddepth = CV_32F; break; default: + CV_Error(Error::StsUnsupportedFormat, "Unsupported input depth"); return; }