brush up convertFp16

* raise an error when wrong bit depth passed * raise an build error when wrong depth is specified for cvtScaleHalf_ * remove unnecessary safe check in cvtScaleHalf_ * use intrinsic instead of direct pointer access * update the explanation
2016-08-03 16:53:52 +09:00 · 2016-08-03 16:53:52 +09:00 · 87ca607fd4
commit 87ca607fd4
parent da2810918c
2 changed files with 18 additions and 25 deletions
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@ -526,8 +526,9 @@ CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst,

 /** @brief Converts an array to half precision floating number.

-convertFp16 converts FP32 to FP16 or FP16 to FP32.  The input array has to have type of CV_32F or
-CV_16S to represent the bit depth.  If the input array is neither of them, it'll do nothing.
+This function converts FP32 (single precision floating point) from/to FP16 (half precision floating point).  The input array has to have type of CV_32F or
+CV_16S to represent the bit depth.  If the input array is neither of them, the function will raise an error.
+The format of half precision floating point is defined in IEEE 754-2008.

@param src input array.
@param dst output array.
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@ -4547,20 +4547,7 @@ static short convertFp16SW(float fp32)

 // template for FP16 HW conversion function
 template<typename T, typename DT> static void
-cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size)
-{
-    sstep /= sizeof(src[0]);
-    dstep /= sizeof(dst[0]);
-
-    for( ; size.height--; src += sstep, dst += dstep )
-    {
-        int x = 0;
-
-        for ( ; x < size.width; x++ )
-        {
-        }
-    }
-}
+cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size);

 template<> void
 cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t dstep, Size size)
@ -4574,23 +4561,25 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t
        {
            int x = 0;

-            if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 )
+#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
+            if ( ( (intptr_t)dst & 0xf ) == 0 )
+#endif
            {
 #if CV_FP16
                for ( ; x <= size.width - 4; x += 4)
                {
 #if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
-                    __m128 v_src = _mm_load_ps(src + x);
+                    __m128 v_src = _mm_loadu_ps(src + x);

                    __m128i v_dst = _mm_cvtps_ph(v_src, 0);

                    _mm_storel_epi64((__m128i *)(dst + x), v_dst);
 #elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-                    float32x4_t v_src = *(float32x4_t*)(src + x);
+                    float32x4_t v_src = vld1q_f32(src + x);

                    float16x4_t v_dst = vcvt_f16_f32(v_src);

-                    *(float16x4_t*)(dst + x) = v_dst;
+                    vst1_f16((float16_t*)(dst + x), v_dst);
 #else
 #error "Configuration error"
 #endif
@ -4628,7 +4617,9 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
        {
            int x = 0;

-            if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 && checkHardwareSupport(CV_CPU_FP16) )
+#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
+            if ( ( (intptr_t)src & 0xf ) == 0 )
+#endif
            {
 #if CV_FP16
                for ( ; x <= size.width - 4; x += 4)
@ -4638,13 +4629,13 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t

                    __m128 v_dst = _mm_cvtph_ps(v_src);

-                    _mm_store_ps((dst + x), v_dst);
+                    _mm_storeu_ps(dst + x, v_dst);
 #elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-                    float16x4_t v_src = *(float16x4_t*)(src + x);
+                    float16x4_t v_src = vld1_f16((float16_t*)(src + x));

                    float32x4_t v_dst = vcvt_f32_f16(v_src);

-                    *(float32x4_t*)(dst + x) = v_dst;
+                    vst1q_f32(dst + x, v_dst);
 #else
 #error "Configuration error"
 #endif
@ -4761,7 +4752,7 @@ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, s
 static void cvtScaleHalf##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
 dtype* dst, size_t dstep, Size size, double*) \
 { \
-    cvtScaleHalf##_<stype,dtype>(src, sstep, dst, dstep, size); \
+    cvtScaleHalf_<stype,dtype>(src, sstep, dst, dstep, size); \
 }

 #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
@ -5153,6 +5144,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
        ddepth = CV_32F;
        break;
    default:
+        CV_Error(Error::StsUnsupportedFormat, "Unsupported input depth");
        return;
    }