Merge pull request #7033 from tomoaki0705:brushUpFp16
This commit is contained in:
commit
b2698f24b0
@ -526,8 +526,9 @@ CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst,
|
|||||||
|
|
||||||
/** @brief Converts an array to half precision floating number.
|
/** @brief Converts an array to half precision floating number.
|
||||||
|
|
||||||
convertFp16 converts FP32 to FP16 or FP16 to FP32. The input array has to have type of CV_32F or
|
This function converts FP32 (single precision floating point) from/to FP16 (half precision floating point). The input array has to have type of CV_32F or
|
||||||
CV_16S to represent the bit depth. If the input array is neither of them, it'll do nothing.
|
CV_16S to represent the bit depth. If the input array is neither of them, the function will raise an error.
|
||||||
|
The format of half precision floating point is defined in IEEE 754-2008.
|
||||||
|
|
||||||
@param src input array.
|
@param src input array.
|
||||||
@param dst output array.
|
@param dst output array.
|
||||||
|
@ -4547,20 +4547,7 @@ static short convertFp16SW(float fp32)
|
|||||||
|
|
||||||
// template for FP16 HW conversion function
|
// template for FP16 HW conversion function
|
||||||
template<typename T, typename DT> static void
|
template<typename T, typename DT> static void
|
||||||
cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size)
|
cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size);
|
||||||
{
|
|
||||||
sstep /= sizeof(src[0]);
|
|
||||||
dstep /= sizeof(dst[0]);
|
|
||||||
|
|
||||||
for( ; size.height--; src += sstep, dst += dstep )
|
|
||||||
{
|
|
||||||
int x = 0;
|
|
||||||
|
|
||||||
for ( ; x < size.width; x++ )
|
|
||||||
{
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template<> void
|
template<> void
|
||||||
cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t dstep, Size size)
|
cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t dstep, Size size)
|
||||||
@ -4574,23 +4561,25 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t
|
|||||||
{
|
{
|
||||||
int x = 0;
|
int x = 0;
|
||||||
|
|
||||||
if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 )
|
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
|
||||||
|
if ( ( (intptr_t)dst & 0xf ) == 0 )
|
||||||
|
#endif
|
||||||
{
|
{
|
||||||
#if CV_FP16
|
#if CV_FP16
|
||||||
for ( ; x <= size.width - 4; x += 4)
|
for ( ; x <= size.width - 4; x += 4)
|
||||||
{
|
{
|
||||||
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
|
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
|
||||||
__m128 v_src = _mm_load_ps(src + x);
|
__m128 v_src = _mm_loadu_ps(src + x);
|
||||||
|
|
||||||
__m128i v_dst = _mm_cvtps_ph(v_src, 0);
|
__m128i v_dst = _mm_cvtps_ph(v_src, 0);
|
||||||
|
|
||||||
_mm_storel_epi64((__m128i *)(dst + x), v_dst);
|
_mm_storel_epi64((__m128i *)(dst + x), v_dst);
|
||||||
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
|
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
|
||||||
float32x4_t v_src = *(float32x4_t*)(src + x);
|
float32x4_t v_src = vld1q_f32(src + x);
|
||||||
|
|
||||||
float16x4_t v_dst = vcvt_f16_f32(v_src);
|
float16x4_t v_dst = vcvt_f16_f32(v_src);
|
||||||
|
|
||||||
*(float16x4_t*)(dst + x) = v_dst;
|
vst1_f16((float16_t*)(dst + x), v_dst);
|
||||||
#else
|
#else
|
||||||
#error "Configuration error"
|
#error "Configuration error"
|
||||||
#endif
|
#endif
|
||||||
@ -4628,7 +4617,9 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
|
|||||||
{
|
{
|
||||||
int x = 0;
|
int x = 0;
|
||||||
|
|
||||||
if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 && checkHardwareSupport(CV_CPU_FP16) )
|
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
|
||||||
|
if ( ( (intptr_t)src & 0xf ) == 0 )
|
||||||
|
#endif
|
||||||
{
|
{
|
||||||
#if CV_FP16
|
#if CV_FP16
|
||||||
for ( ; x <= size.width - 4; x += 4)
|
for ( ; x <= size.width - 4; x += 4)
|
||||||
@ -4638,13 +4629,13 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
|
|||||||
|
|
||||||
__m128 v_dst = _mm_cvtph_ps(v_src);
|
__m128 v_dst = _mm_cvtph_ps(v_src);
|
||||||
|
|
||||||
_mm_store_ps((dst + x), v_dst);
|
_mm_storeu_ps(dst + x, v_dst);
|
||||||
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
|
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
|
||||||
float16x4_t v_src = *(float16x4_t*)(src + x);
|
float16x4_t v_src = vld1_f16((float16_t*)(src + x));
|
||||||
|
|
||||||
float32x4_t v_dst = vcvt_f32_f16(v_src);
|
float32x4_t v_dst = vcvt_f32_f16(v_src);
|
||||||
|
|
||||||
*(float32x4_t*)(dst + x) = v_dst;
|
vst1q_f32(dst + x, v_dst);
|
||||||
#else
|
#else
|
||||||
#error "Configuration error"
|
#error "Configuration error"
|
||||||
#endif
|
#endif
|
||||||
@ -4761,7 +4752,7 @@ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, s
|
|||||||
static void cvtScaleHalf##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
|
static void cvtScaleHalf##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
|
||||||
dtype* dst, size_t dstep, Size size, double*) \
|
dtype* dst, size_t dstep, Size size, double*) \
|
||||||
{ \
|
{ \
|
||||||
cvtScaleHalf##_<stype,dtype>(src, sstep, dst, dstep, size); \
|
cvtScaleHalf_<stype,dtype>(src, sstep, dst, dstep, size); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
|
#define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
|
||||||
@ -5153,6 +5144,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
|
|||||||
ddepth = CV_32F;
|
ddepth = CV_32F;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
CV_Error(Error::StsUnsupportedFormat, "Unsupported input depth");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user