brush up convertFp16

* raise an error when a wrong bit depth is passed
  * raise a build error when a wrong depth is specified for cvtScaleHalf_
  * remove unnecessary safety check in cvtScaleHalf_
  * use intrinsic instead of direct pointer access
  * update the explanation
This commit is contained in:
Tomoaki Teshima 2016-08-03 16:53:52 +09:00 committed by Tomoaki Teshima
parent da2810918c
commit 87ca607fd4
2 changed files with 18 additions and 25 deletions

View File

@ -526,8 +526,9 @@ CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst,
/** @brief Converts an array to half precision floating-point number.
convertFp16 converts FP32 to FP16 or FP16 to FP32. The input array has to have type of CV_32F or
CV_16S to represent the bit depth. If the input array is neither of them, it'll do nothing.
This function converts FP32 (single precision floating point) from/to FP16 (half precision floating point). The input array has to have type of CV_32F or
CV_16S to represent the bit depth. If the input array is neither of them, the function will raise an error.
The format of half precision floating point is defined in IEEE 754-2008.
@param src input array.
@param dst output array.

View File

@ -4547,20 +4547,7 @@ static short convertFp16SW(float fp32)
// template for FP16 HW conversion function
// Generic (unspecialized) form of the FP16 conversion routine.
// As written here it walks every row and column but never reads from src or
// writes to dst, so any <T, DT> pair that reaches this version is a no-op.
// Explicit specializations follow this definition.
//   src / sstep : source rows and the distance between them
//   dst / dstep : destination rows and the distance between them
//   size        : width and height of the region to traverse
template<typename T, typename DT> static void
cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size)
{
// Rescale the steps into element counts so they can be added to the typed
// pointers (presumably the incoming steps are in bytes -- confirm with callers).
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
// One iteration per row; size is a by-value copy, so consuming size.height
// here does not affect the caller.
for( ; size.height--; src += sstep, dst += dstep )
{
int x = 0;
// Intentionally empty body: no per-element conversion is performed.
for ( ; x < size.width; x++ )
{
}
}
}
cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size);
template<> void
cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t dstep, Size size)
@ -4574,23 +4561,25 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t
{
int x = 0;
if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 )
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
if ( ( (intptr_t)dst & 0xf ) == 0 )
#endif
{
#if CV_FP16
for ( ; x <= size.width - 4; x += 4)
{
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
__m128 v_src = _mm_load_ps(src + x);
__m128 v_src = _mm_loadu_ps(src + x);
__m128i v_dst = _mm_cvtps_ph(v_src, 0);
_mm_storel_epi64((__m128i *)(dst + x), v_dst);
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
float32x4_t v_src = *(float32x4_t*)(src + x);
float32x4_t v_src = vld1q_f32(src + x);
float16x4_t v_dst = vcvt_f16_f32(v_src);
*(float16x4_t*)(dst + x) = v_dst;
vst1_f16((float16_t*)(dst + x), v_dst);
#else
#error "Configuration error"
#endif
@ -4628,7 +4617,9 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
{
int x = 0;
if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 && checkHardwareSupport(CV_CPU_FP16) )
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
if ( ( (intptr_t)src & 0xf ) == 0 )
#endif
{
#if CV_FP16
for ( ; x <= size.width - 4; x += 4)
@ -4638,13 +4629,13 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
__m128 v_dst = _mm_cvtph_ps(v_src);
_mm_store_ps((dst + x), v_dst);
_mm_storeu_ps(dst + x, v_dst);
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
float16x4_t v_src = *(float16x4_t*)(src + x);
float16x4_t v_src = vld1_f16((float16_t*)(src + x));
float32x4_t v_dst = vcvt_f32_f16(v_src);
*(float32x4_t*)(dst + x) = v_dst;
vst1q_f32(dst + x, v_dst);
#else
#error "Configuration error"
#endif
@ -4761,7 +4752,7 @@ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, s
static void cvtScaleHalf##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
dtype* dst, size_t dstep, Size size, double*) \
{ \
cvtScaleHalf##_<stype,dtype>(src, sstep, dst, dstep, size); \
cvtScaleHalf_<stype,dtype>(src, sstep, dst, dstep, size); \
}
#define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
@ -5153,6 +5144,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
ddepth = CV_32F;
break;
default:
CV_Error(Error::StsUnsupportedFormat, "Unsupported input depth");
return;
}