SymmRowSmallVec_32f 1x5
NEON speedup: 2.36x; auto-vectorization speedup: 2.36x. Test kernel: [0.1, 0.2408, 0.3184, 0.2408, 0.1].
This commit is contained in:
parent
ed0ce48179
commit
13c0855114
@ -2932,10 +2932,102 @@ struct SymmColumnVec_32f16s
|
||||
};
|
||||
|
||||
|
||||
struct SymmRowSmallVec_32f
|
||||
{
|
||||
SymmRowSmallVec_32f() {}
|
||||
SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
|
||||
{
|
||||
kernel = _kernel;
|
||||
symmetryType = _symmetryType;
|
||||
}
|
||||
|
||||
int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
|
||||
{
|
||||
//Uncomment the two following lines when runtime support for neon is implemented.
|
||||
// if( !checkHardwareSupport(CV_CPU_NEON) )
|
||||
// return 0;
|
||||
|
||||
int i = 0, _ksize = kernel.rows + kernel.cols - 1;
|
||||
float* dst = (float*)_dst;
|
||||
const float* src = (const float*)_src + (_ksize/2)*cn;
|
||||
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
|
||||
const float* kx = kernel.ptr<float>() + _ksize/2;
|
||||
width *= cn;
|
||||
|
||||
if( symmetrical )
|
||||
{
|
||||
if( _ksize == 1 )
|
||||
return 0;
|
||||
if( _ksize == 3 )
|
||||
{
|
||||
if( kx[0] == 2 && kx[1] == 1 )
|
||||
return 0;
|
||||
else if( kx[0] == -2 && kx[1] == 1 )
|
||||
return 0;
|
||||
else
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
else if( _ksize == 5 )
|
||||
{
|
||||
if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
|
||||
return 0;
|
||||
else
|
||||
{
|
||||
float32x2_t k0, k1;
|
||||
k0 = k1 = vdup_n_f32(0);
|
||||
k0 = vld1_lane_f32(kx + 0, k0, 0);
|
||||
k0 = vld1_lane_f32(kx + 1, k0, 1);
|
||||
k1 = vld1_lane_f32(kx + 2, k1, 0);
|
||||
|
||||
for( ; i <= width - 4; i += 4, src += 4 )
|
||||
{
|
||||
float32x4_t x0, x1, x2, x3, x4;
|
||||
x0 = vld1q_f32(src);
|
||||
x1 = vld1q_f32(src - cn);
|
||||
x2 = vld1q_f32(src + cn);
|
||||
x3 = vld1q_f32(src - cn*2);
|
||||
x4 = vld1q_f32(src + cn*2);
|
||||
|
||||
float32x4_t y0;
|
||||
y0 = vmulq_lane_f32(x0, k0, 0);
|
||||
y0 = vmlaq_lane_f32(y0, vaddq_f32(x1, x2), k0, 1);
|
||||
y0 = vmlaq_lane_f32(y0, vaddq_f32(x3, x4), k1, 0);
|
||||
|
||||
vst1q_f32(dst + i, y0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if( _ksize == 3 )
|
||||
{
|
||||
if( kx[0] == 0 && kx[1] == 1 )
|
||||
return 0;
|
||||
else
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
else if( _ksize == 5 )
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
Mat kernel;
|
||||
int symmetryType;
|
||||
};
|
||||
|
||||
|
||||
typedef RowNoVec RowVec_8u32s;
|
||||
typedef RowNoVec RowVec_16s32f;
|
||||
typedef RowNoVec RowVec_32f;
|
||||
typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
|
||||
typedef ColumnNoVec SymmColumnVec_32f;
|
||||
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
|
||||
typedef FilterNoVec FilterVec_8u;
|
||||
|
Loading…
Reference in New Issue
Block a user