From 87ca607fd4d4b44b22d84b7fdb2c88cec3287dc8 Mon Sep 17 00:00:00 2001
From: Tomoaki Teshima <teshima@exvision.co.jp>
Date: Wed, 3 Aug 2016 16:53:52 +0900
Subject: [PATCH] brush up convertFp16   * raise an error when wrong bit depth
 is passed   * raise a build error when wrong depth is specified for
 cvtScaleHalf_   * remove unnecessary safe check in cvtScaleHalf_   * use
 intrinsic instead of direct pointer access   * update the explanation

---
 modules/core/include/opencv2/core.hpp |  5 ++--
 modules/core/src/convert.cpp          | 38 +++++++++++----------------
 2 files changed, 18 insertions(+), 25 deletions(-)
diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp
index d94b67877..88b06f391 100644
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -526,8 +526,9 @@ CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst,
 
 /** @brief Converts an array to half precision floating number.
 
-convertFp16 converts FP32 to FP16 or FP16 to FP32.  The input array has to have type of CV_32F or
-CV_16S to represent the bit depth.  If the input array is neither of them, it'll do nothing.
+This function converts between FP32 (single precision floating point) and FP16 (half precision floating point).  The input array must have
+type CV_32F or CV_16S to represent the bit depth.  If the input array has neither of these types, the function raises an error.
+The format of half precision floating point is defined in IEEE 754-2008.
 
 @param src input array.
 @param dst output array.
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index f6178d2bc..dc974505e 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -4547,20 +4547,7 @@ static short convertFp16SW(float fp32)
 
 // template for FP16 HW conversion function
 template<typename T, typename DT> static void
-cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size)
-{
-    sstep /= sizeof(src[0]);
-    dstep /= sizeof(dst[0]);
-
-    for( ; size.height--; src += sstep, dst += dstep )
-    {
-        int x = 0;
-
-        for ( ; x < size.width; x++ )
-        {
-        }
-    }
-}
+cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size);
 
 template<> void
 cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t dstep, Size size)
@@ -4574,23 +4561,25 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t
         {
             int x = 0;
 
-            if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 )
+#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
+            if ( ( (intptr_t)dst & 0xf ) == 0 )
+#endif
             {
 #if CV_FP16
                 for ( ; x <= size.width - 4; x += 4)
                 {
 #if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
-                    __m128 v_src = _mm_load_ps(src + x);
+                    __m128 v_src = _mm_loadu_ps(src + x);
 
                     __m128i v_dst = _mm_cvtps_ph(v_src, 0);
 
                     _mm_storel_epi64((__m128i *)(dst + x), v_dst);
 #elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-                    float32x4_t v_src = *(float32x4_t*)(src + x);
+                    float32x4_t v_src = vld1q_f32(src + x);
 
                     float16x4_t v_dst = vcvt_f16_f32(v_src);
 
-                    *(float16x4_t*)(dst + x) = v_dst;
+                    vst1_f16((float16_t*)(dst + x), v_dst);
 #else
 #error "Configuration error"
 #endif
@@ -4628,7 +4617,9 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
         {
             int x = 0;
 
-            if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 && checkHardwareSupport(CV_CPU_FP16) )
+#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
+            if ( ( (intptr_t)src & 0xf ) == 0 )
+#endif
             {
 #if CV_FP16
                 for ( ; x <= size.width - 4; x += 4)
@@ -4638,13 +4629,13 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
 
                     __m128 v_dst = _mm_cvtph_ps(v_src);
 
-                    _mm_store_ps((dst + x), v_dst);
+                    _mm_storeu_ps(dst + x, v_dst);
 #elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-                    float16x4_t v_src = *(float16x4_t*)(src + x);
+                    float16x4_t v_src = vld1_f16((float16_t*)(src + x));
 
                     float32x4_t v_dst = vcvt_f32_f16(v_src);
 
-                    *(float32x4_t*)(dst + x) = v_dst;
+                    vst1q_f32(dst + x, v_dst);
 #else
 #error "Configuration error"
 #endif
@@ -4761,7 +4752,7 @@ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, s
 static void cvtScaleHalf##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
 dtype* dst, size_t dstep, Size size, double*) \
 { \
-    cvtScaleHalf##_<stype,dtype>(src, sstep, dst, dstep, size); \
+    cvtScaleHalf_<stype,dtype>(src, sstep, dst, dstep, size); \
 }
 
 #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
@@ -5153,6 +5144,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
         ddepth = CV_32F;
         break;
     default:
+        CV_Error(Error::StsUnsupportedFormat, "Unsupported input depth");
         return;
     }