SSE2 optimization of cv::preCornerDetect

This commit is contained in:
Ilya Lavrenov 2014-06-30 01:47:51 +04:00
parent 2d81595ed4
commit 654bdde8ed

View File

@@ -608,6 +608,11 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord
factor *= 255;
factor = 1./(factor * factor * factor);
#if CV_SSE2
volatile bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
__m128 v_factor = _mm_set1_ps((float)factor), v_m2 = _mm_set1_ps(-2.0f);
#endif
Size size = src.size();
int i, j;
for( i = 0; i < size.height; i++ )
@@ -619,7 +624,26 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord
const float* d2ydata = (const float*)(D2y.data + i*D2y.step);
const float* dxydata = (const float*)(Dxy.data + i*Dxy.step);
for( j = 0; j < size.width; j++ )
j = 0;
#if CV_SSE2
if (haveSSE2)
{
for( ; j <= size.width - 4; j += 4 )
{
__m128 v_dx = _mm_loadu_ps((const float *)(dxdata + j));
__m128 v_dy = _mm_loadu_ps((const float *)(dydata + j));
__m128 v_s1 = _mm_mul_ps(_mm_mul_ps(v_dx, v_dx), _mm_loadu_ps((const float *)(d2ydata + j)));
__m128 v_s2 = _mm_mul_ps(_mm_mul_ps(v_dy, v_dy), _mm_loadu_ps((const float *)(d2xdata + j)));
__m128 v_s3 = _mm_mul_ps(_mm_mul_ps(v_dx, v_dy), _mm_loadu_ps((const float *)(dxydata + j)));
v_s1 = _mm_mul_ps(v_factor, _mm_add_ps(v_s1, _mm_add_ps(v_s2, _mm_mul_ps(v_s3, v_m2))));
_mm_storeu_ps(dstdata + j, v_s1);
}
}
#endif
for( ; j < size.width; j++ )
{
float dx = dxdata[j];
float dy = dydata[j];