Merge pull request #7033 from tomoaki0705:brushUpFp16

2016-08-04 09:31:52 +00:00 · 2016-08-04 09:31:52 +00:00 · b2698f24b0
commit b2698f24b0
parent 7eee5cb724 87ca607fd4
2 changed files with 18 additions and 25 deletions
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@ -526,8 +526,9 @@ CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst,
 /** @brief Converts an array to half precision floating number.
-convertFp16 converts FP32 to FP16 or FP16 to FP32.  The input array has to have type of CV_32F or
+This function converts FP32 (single precision floating point) from/to FP16 (half precision floating point).  The input array has to have type of CV_32F or
-CV_16S to represent the bit depth.  If the input array is neither of them, it'll do nothing.
+CV_16S to represent the bit depth.  If the input array is neither of them, the function will raise an error.
 The format of half precision floating point is defined in IEEE 754-2008.
@param src input array.
@param dst output array.
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@ -4547,20 +4547,7 @@ static short convertFp16SW(float fp32)
 // template for FP16 HW conversion function
 template<typename T, typename DT> static void
-cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size)
+cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size);
 {
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);
    for( ; size.height--; src += sstep, dst += dstep )
    {
        int x = 0;
        for ( ; x < size.width; x++ )
        {
        }
    }
 }
 template<> void
 cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t dstep, Size size)
@ -4574,23 +4561,25 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t
        {
            int x = 0;
-            if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 )
+#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
            if ( ( (intptr_t)dst & 0xf ) == 0 )
 #endif
            {
 #if CV_FP16
                for ( ; x <= size.width - 4; x += 4)
                {
 #if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
-                    __m128 v_src = _mm_load_ps(src + x);
+                    __m128 v_src = _mm_loadu_ps(src + x);
                    __m128i v_dst = _mm_cvtps_ph(v_src, 0);
                    _mm_storel_epi64((__m128i *)(dst + x), v_dst);
 #elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-                    float32x4_t v_src = *(float32x4_t*)(src + x);
+                    float32x4_t v_src = vld1q_f32(src + x);
                    float16x4_t v_dst = vcvt_f16_f32(v_src);
-                    *(float16x4_t*)(dst + x) = v_dst;
+                    vst1_f16((float16_t*)(dst + x), v_dst);
 #else
 #error "Configuration error"
 #endif
@ -4628,7 +4617,9 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
        {
            int x = 0;
-            if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 && checkHardwareSupport(CV_CPU_FP16) )
+#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
            if ( ( (intptr_t)src & 0xf ) == 0 )
 #endif
            {
 #if CV_FP16
                for ( ; x <= size.width - 4; x += 4)
@ -4638,13 +4629,13 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
                    __m128 v_dst = _mm_cvtph_ps(v_src);
-                    _mm_store_ps((dst + x), v_dst);
+                    _mm_storeu_ps(dst + x, v_dst);
 #elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-                    float16x4_t v_src = *(float16x4_t*)(src + x);
+                    float16x4_t v_src = vld1_f16((float16_t*)(src + x));
                    float32x4_t v_dst = vcvt_f32_f16(v_src);
-                    *(float32x4_t*)(dst + x) = v_dst;
+                    vst1q_f32(dst + x, v_dst);
 #else
 #error "Configuration error"
 #endif
@ -4761,7 +4752,7 @@ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, s
 static void cvtScaleHalf##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
 dtype* dst, size_t dstep, Size size, double*) \
 { \
-    cvtScaleHalf##_<stype,dtype>(src, sstep, dst, dstep, size); \
+    cvtScaleHalf_<stype,dtype>(src, sstep, dst, dstep, size); \
 }
 #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
@ -5153,6 +5144,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
        ddepth = CV_32F;
        break;
    default:
        CV_Error(Error::StsUnsupportedFormat, "Unsupported input depth");
        return;
    }