Merge pull request #7033 from tomoaki0705:brushUpFp16

This commit is contained in:
Alexander Alekhin 2016-08-04 09:31:52 +00:00
commit b2698f24b0
2 changed files with 18 additions and 25 deletions

View File

@ -526,8 +526,9 @@ CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst,
/** @brief Converts an array to half precision floating number. /** @brief Converts an array to half precision floating number.
convertFp16 converts FP32 to FP16 or FP16 to FP32. The input array has to have type of CV_32F or This function converts FP32 (single precision floating point) from/to FP16 (half precision floating point). The input array has to have type of CV_32F or
CV_16S to represent the bit depth. If the input array is neither of them, it'll do nothing. CV_16S to represent the bit depth. If the input array is neither of them, the function will raise an error.
The format of half precision floating point is defined in IEEE 754-2008.
@param src input array. @param src input array.
@param dst output array. @param dst output array.

View File

@ -4547,20 +4547,7 @@ static short convertFp16SW(float fp32)
// template for FP16 HW conversion function // template for FP16 HW conversion function
template<typename T, typename DT> static void template<typename T, typename DT> static void
cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size) cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size);
{
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
for( ; size.height--; src += sstep, dst += dstep )
{
int x = 0;
for ( ; x < size.width; x++ )
{
}
}
}
template<> void template<> void
cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t dstep, Size size) cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t dstep, Size size)
@ -4574,23 +4561,25 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t
{ {
int x = 0; int x = 0;
if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 ) #if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
if ( ( (intptr_t)dst & 0xf ) == 0 )
#endif
{ {
#if CV_FP16 #if CV_FP16
for ( ; x <= size.width - 4; x += 4) for ( ; x <= size.width - 4; x += 4)
{ {
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386) #if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
__m128 v_src = _mm_load_ps(src + x); __m128 v_src = _mm_loadu_ps(src + x);
__m128i v_dst = _mm_cvtps_ph(v_src, 0); __m128i v_dst = _mm_cvtps_ph(v_src, 0);
_mm_storel_epi64((__m128i *)(dst + x), v_dst); _mm_storel_epi64((__m128i *)(dst + x), v_dst);
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__) #elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
float32x4_t v_src = *(float32x4_t*)(src + x); float32x4_t v_src = vld1q_f32(src + x);
float16x4_t v_dst = vcvt_f16_f32(v_src); float16x4_t v_dst = vcvt_f16_f32(v_src);
*(float16x4_t*)(dst + x) = v_dst; vst1_f16((float16_t*)(dst + x), v_dst);
#else #else
#error "Configuration error" #error "Configuration error"
#endif #endif
@ -4628,7 +4617,9 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
{ {
int x = 0; int x = 0;
if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 && checkHardwareSupport(CV_CPU_FP16) ) #if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
if ( ( (intptr_t)src & 0xf ) == 0 )
#endif
{ {
#if CV_FP16 #if CV_FP16
for ( ; x <= size.width - 4; x += 4) for ( ; x <= size.width - 4; x += 4)
@ -4638,13 +4629,13 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
__m128 v_dst = _mm_cvtph_ps(v_src); __m128 v_dst = _mm_cvtph_ps(v_src);
_mm_store_ps((dst + x), v_dst); _mm_storeu_ps(dst + x, v_dst);
#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__) #elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
float16x4_t v_src = *(float16x4_t*)(src + x); float16x4_t v_src = vld1_f16((float16_t*)(src + x));
float32x4_t v_dst = vcvt_f32_f16(v_src); float32x4_t v_dst = vcvt_f32_f16(v_src);
*(float32x4_t*)(dst + x) = v_dst; vst1q_f32(dst + x, v_dst);
#else #else
#error "Configuration error" #error "Configuration error"
#endif #endif
@ -4761,7 +4752,7 @@ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, s
static void cvtScaleHalf##suffix( const stype* src, size_t sstep, const uchar*, size_t, \ static void cvtScaleHalf##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
dtype* dst, size_t dstep, Size size, double*) \ dtype* dst, size_t dstep, Size size, double*) \
{ \ { \
cvtScaleHalf##_<stype,dtype>(src, sstep, dst, dstep, size); \ cvtScaleHalf_<stype,dtype>(src, sstep, dst, dstep, size); \
} }
#define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \ #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
@ -5153,6 +5144,7 @@ void cv::convertFp16( InputArray _src, OutputArray _dst)
ddepth = CV_32F; ddepth = CV_32F;
break; break;
default: default:
CV_Error(Error::StsUnsupportedFormat, "Unsupported input depth");
return; return;
} }