neon
This commit is contained in:
@@ -1541,6 +1541,20 @@ cvtScale_<short, short, float>( const short* src, size_t sstep,
|
||||
_mm_storeu_si128((__m128i*)(dst + x), r0);
|
||||
}
|
||||
}
|
||||
#elif CV_NEON
|
||||
float32x4_t v_shift = vdupq_n_f32(shift);
|
||||
for(; x <= size.width - 8; x += 8 )
|
||||
{
|
||||
int16x8_t v_src = vld1q_s16(src + x);
|
||||
float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
|
||||
float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
|
||||
|
||||
v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
|
||||
v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
|
||||
|
||||
vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_tmp1)),
|
||||
vqmovn_s32(cv_vrndq_s32_f32(v_tmp2))));
|
||||
}
|
||||
#endif
|
||||
|
||||
for(; x < size.width; x++ )
|
||||
@@ -1580,6 +1594,20 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
|
||||
_mm_storeu_si128((__m128i*)(dst + x + 4), r1);
|
||||
}
|
||||
}
|
||||
#elif CV_NEON
|
||||
float32x4_t v_shift = vdupq_n_f32(shift);
|
||||
for(; x <= size.width - 8; x += 8 )
|
||||
{
|
||||
int16x8_t v_src = vld1q_s16(src + x);
|
||||
float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
|
||||
float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
|
||||
|
||||
v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
|
||||
v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
|
||||
|
||||
vst1q_s32(dst + x, cv_vrndq_s32_f32(v_tmp1));
|
||||
vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_tmp2));
|
||||
}
|
||||
#endif
|
||||
|
||||
//We will wait for Haswell
|
||||
@@ -2134,6 +2162,14 @@ cvt_<float, short>( const float* src, size_t sstep,
|
||||
_mm_storeu_si128((__m128i*)(dst + x),src1_int128);
|
||||
}
|
||||
}
|
||||
#elif CV_NEON
|
||||
for( ; x <= size.width - 8; x += 8 )
|
||||
{
|
||||
float32x4_t v_src1 = vld1q_f32(src + x), v_src2 = vld1q_f32(src + x + 4);
|
||||
int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_src1)),
|
||||
vqmovn_s32(cv_vrndq_s32_f32(v_src2)));
|
||||
vst1q_s16(dst + x, v_dst);
|
||||
}
|
||||
#endif
|
||||
for( ; x < size.width; x++ )
|
||||
dst[x] = saturate_cast<short>(src[x]);
|
||||
|
Reference in New Issue
Block a user