brush up convertFp16
* raise an error when a wrong bit depth is passed * raise a build error when a wrong depth is specified for cvtScaleHalf_ * remove unnecessary safety checks in cvtScaleHalf_ * use intrinsics instead of direct pointer access * update the explanation
This commit is contained in:
parent
da2810918c
commit
87ca607fd4
@ -526,8 +526,9 @@ CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst,
|
||||
|
||||
/** @brief Converts an array to half-precision floating-point numbers.
|
||||
|
||||
convertFp16 converts FP32 to FP16 or FP16 to FP32. The input array has to have type of CV_32F or
|
||||
CV_16S to represent the bit depth. If the input array is neither of them, it'll do nothing.
|
||||
This function converts FP32 (single precision floating point) from/to FP16 (half precision floating point). The input array has to have type of CV_32F or
|
||||
CV_16S to represent the bit depth. If the input array is neither of them, the function will raise an error.
|
||||
The format of half precision floating point is defined in IEEE 754-2008.
|
||||
|
||||
@param src input array.
|
||||
@param dst output array.
|
||||
|
@ -4547,20 +4547,7 @@ static short convertFp16SW(float fp32)
|
||||
|
||||
// template for FP16 HW conversion function
|
||||
template<typename T, typename DT> static void
|
||||
cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size)
|
||||
{
|
||||
sstep /= sizeof(src[0]);
|
||||
dstep /= sizeof(dst[0]);
|
||||
|
||||
for( ; size.height--; src += sstep, dst += dstep )
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
for ( ; x < size.width; x++ )
|
||||
{
|
||||
}
|
||||
}
|
||||
}
|
||||
cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size);
|
||||
|
||||
template<> void
|
||||
cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t dstep, Size size)
|
||||
@ -4574,23 +4561,25 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 )
|
||||
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
|
||||
if ( ( (intptr_t)dst & 0xf ) == 0 )
|
||||
#endif
|
||||
{
|
||||
#if CV_FP16
|
||||
for ( ; x <= size.width - 4; x += 4)
|
||||
{
|
||||
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
|
||||
__m128 v_src = _mm_load_ps(src + x);
|
||||
__m128 v_src = _mm_loadu_ps(src + x);
|
||||
|
||||
__m128i v_dst = _mm_cvtps_ph(v_src, 0);
|
||||
|
||||
_mm_storel_epi64((__m128i *)(dst + x), v_dst);
|
||||
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
|
||||
float32x4_t v_src = *(float32x4_t*)(src + x);
|
||||
float32x4_t v_src = vld1q_f32(src + x);
|
||||
|
||||
float16x4_t v_dst = vcvt_f16_f32(v_src);
|
||||
|
||||
*(float16x4_t*)(dst + x) = v_dst;
|
||||
vst1_f16((float16_t*)(dst + x), v_dst);
|
||||
#else
|
||||
#error "Configuration error"
|
||||
#endif
|
||||
@ -4628,7 +4617,9 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 && checkHardwareSupport(CV_CPU_FP16) )
|
||||
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
|
||||
if ( ( (intptr_t)src & 0xf ) == 0 )
|
||||
#endif
|
||||
{
|
||||
#if CV_FP16
|
||||
for ( ; x <= size.width - 4; x += 4)
|
||||
@ -4638,13 +4629,13 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
|
||||
|
||||
__m128 v_dst = _mm_cvtph_ps(v_src);
|
||||
|
||||
_mm_store_ps((dst + x), v_dst);
|
||||
_mm_storeu_ps(dst + x, v_dst);
|
||||
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
|
||||
float16x4_t v_src = *(float16x4_t*)(src + x);
|
||||
float16x4_t v_src = vld1_f16((float16_t*)(src + x));
|
||||
|
||||
float32x4_t v_dst = vcvt_f32_f16(v_src);
|
||||
|
||||
*(float32x4_t*)(dst + x) = v_dst;
|
||||
vst1q_f32(dst + x, v_dst);
|
||||
#else
|
||||
#error "Configuration error"
|
||||
#endif
|
||||
@ -4761,7 +4752,7 @@ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, s
|
||||
static void cvtScaleHalf##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
|
||||
dtype* dst, size_t dstep, Size size, double*) \
|
||||
{ \
|
||||
cvtScaleHalf##_<stype,dtype>(src, sstep, dst, dstep, size); \
|
||||
cvtScaleHalf_<stype,dtype>(src, sstep, dst, dstep, size); \
|
||||
}
|
||||
|
||||
#define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
|
||||
@ -5153,6 +5144,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
|
||||
ddepth = CV_32F;
|
||||
break;
|
||||
default:
|
||||
CV_Error(Error::StsUnsupportedFormat, "Unsupported input depth");
|
||||
return;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user