diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index 8e3e170b2..c84f01dcf 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -719,29 +719,133 @@ static const double expTab[] = {
     1.9571441241754002690183222516269 * EXPPOLY_32F_A0,
     1.9784560263879509682582499181312 * EXPPOLY_32F_A0,
 };
-
+
+
 static const double exp_prescale = 1.4426950408889634073599246810019 * (1 << EXPTAB_SCALE);
 static const double exp_postscale = 1./(1 << EXPTAB_SCALE);
 static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < 3000
 
 static CvStatus CV_STDCALL Exp_32f( const float *_x, float *y, int n )
 {
-    static const double
-    EXPPOLY_32F_A4 = 1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0,
-    EXPPOLY_32F_A3 = .6931471805521448196800669615864773144641 / EXPPOLY_32F_A0,
-    EXPPOLY_32F_A2 = .2402265109513301490103372422686535526573 / EXPPOLY_32F_A0,
-    EXPPOLY_32F_A1 = .5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0;
-
-    #undef EXPPOLY
-    #define EXPPOLY(x) \
-    (((((x) + EXPPOLY_32F_A1)*(x) + EXPPOLY_32F_A2)*(x) + EXPPOLY_32F_A3)*(x) + EXPPOLY_32F_A4)
-
+    static const float
+        A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
+        A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
+        A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0),
+        A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0);
+
+#undef EXPPOLY
+#define EXPPOLY(x) \
+    (((((x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)
+
     int i = 0;
-    DBLINT buf[4];
     const Cv32suf* x = (const Cv32suf*)_x;
+    Cv32suf buf[4];
 
-    buf[0].i.lo = buf[1].i.lo = buf[2].i.lo = buf[3].i.lo = 0;
+#if CV_SSE2
+    if( n >= 8 && checkHardwareSupport(CV_CPU_SSE) )
+    {
+        static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
+        static const __m128 postscale4 = _mm_set1_ps((float)exp_postscale);
+        static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale));
+        static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale));
+
+        static const __m128 mA1 = _mm_set1_ps(A1);
+        static const __m128 mA2 = _mm_set1_ps(A2);
+        static const __m128 mA3 = _mm_set1_ps(A3);
+        static const __m128 mA4 = _mm_set1_ps(A4);
+        bool y_aligned = (size_t)(void*)y % 16 == 0;
+
+        ushort CV_DECL_ALIGNED(16) tab_idx[8];
+
+        for( ; i <= n - 8; i += 8 )
+        {
+            __m128 xf0, xf1;
+            xf0 = _mm_loadu_ps(&x[i].f);
+            xf1 = _mm_loadu_ps(&x[i+4].f);
+            __m128i xi0, xi1, xi2, xi3;
+
+            xf0 = _mm_min_ps(_mm_max_ps(xf0, minval4), maxval4);
+            xf1 = _mm_min_ps(_mm_max_ps(xf1, minval4), maxval4);
+
+            __m128d xd0 = _mm_cvtps_pd(xf0);
+            __m128d xd2 = _mm_cvtps_pd(_mm_movehl_ps(xf0, xf0));
+            __m128d xd1 = _mm_cvtps_pd(xf1);
+            __m128d xd3 = _mm_cvtps_pd(_mm_movehl_ps(xf1, xf1));
+
+            xd0 = _mm_mul_pd(xd0, prescale2);
+            xd2 = _mm_mul_pd(xd2, prescale2);
+            xd1 = _mm_mul_pd(xd1, prescale2);
+            xd3 = _mm_mul_pd(xd3, prescale2);
+
+            xi0 = _mm_cvtpd_epi32(xd0);
+            xi2 = _mm_cvtpd_epi32(xd2);
+
+            xi1 = _mm_cvtpd_epi32(xd1);
+            xi3 = _mm_cvtpd_epi32(xd3);
+
+            xd0 = _mm_sub_pd(xd0, _mm_cvtepi32_pd(xi0));
+            xd2 = _mm_sub_pd(xd2, _mm_cvtepi32_pd(xi2));
+            xd1 = _mm_sub_pd(xd1, _mm_cvtepi32_pd(xi1));
+            xd3 = _mm_sub_pd(xd3, _mm_cvtepi32_pd(xi3));
+
+            xf0 = _mm_movelh_ps(_mm_cvtpd_ps(xd0), _mm_cvtpd_ps(xd2));
+            xf1 = _mm_movelh_ps(_mm_cvtpd_ps(xd1), _mm_cvtpd_ps(xd3));
+
+            xf0 = _mm_mul_ps(xf0, postscale4);
+            xf1 = _mm_mul_ps(xf1, postscale4);
+
+            xi0 = _mm_unpacklo_epi64(xi0, xi2);
+            xi1 = _mm_unpacklo_epi64(xi1, xi3);
+            xi0 = _mm_packs_epi32(xi0, xi1);
+
+            _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK)));
+
+            xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127));
+            xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
+            xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255));
+            xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128());
+            xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
+
+            __m128d yd0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1]));
+            __m128d yd1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3]));
+            __m128d yd2 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[4]), _mm_load_sd(expTab + tab_idx[5]));
+            __m128d yd3 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[6]), _mm_load_sd(expTab + tab_idx[7]));
+
+            __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1));
+            __m128 yf1 = _mm_movelh_ps(_mm_cvtpd_ps(yd2), _mm_cvtpd_ps(yd3));
+
+            yf0 = _mm_mul_ps(yf0, _mm_castsi128_ps(_mm_slli_epi32(xi0, 23)));
+            yf1 = _mm_mul_ps(yf1, _mm_castsi128_ps(_mm_slli_epi32(xi1, 23)));
+
+            __m128 zf0 = _mm_add_ps(xf0, mA1);
+            __m128 zf1 = _mm_add_ps(xf1, mA1);
+
+            zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA2);
+            zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA2);
+
+            zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA3);
+            zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA3);
+
+            zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA4);
+            zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA4);
+
+            zf0 = _mm_mul_ps(zf0, yf0);
+            zf1 = _mm_mul_ps(zf1, yf1);
+
+            if( y_aligned )
+            {
+                _mm_store_ps(y + i, zf0);
+                _mm_store_ps(y + i + 4, zf1);
+            }
+            else
+            {
+                _mm_storeu_ps(y + i, zf0);
+                _mm_storeu_ps(y + i + 4, zf1);
+            }
+        }
+    }
+    else
+#endif
     for( ; i <= n - 4; i += 4 )
     {
         double x0 = x[i].f * exp_prescale;
@@ -749,183 +853,252 @@ static CvStatus CV_STDCALL Exp_32f( const float *_x, float *y, int n )
         double x2 = x[i + 2].f * exp_prescale;
         double x3 = x[i + 3].f * exp_prescale;
         int val0, val1, val2, val3, t;
-
+
         if( ((x[i].i >> 23) & 255) > 127 + 10 )
             x0 = x[i].i < 0 ? -exp_max_val : exp_max_val;
-
+
         if( ((x[i+1].i >> 23) & 255) > 127 + 10 )
             x1 = x[i+1].i < 0 ? -exp_max_val : exp_max_val;
-
+
        if( ((x[i+2].i >> 23) & 255) > 127 + 10 )
            x2 = x[i+2].i < 0 ? -exp_max_val : exp_max_val;
-
+
        if( ((x[i+3].i >> 23) & 255) > 127 + 10 )
            x3 = x[i+3].i < 0 ? -exp_max_val : exp_max_val;
-
+
        val0 = cvRound(x0);
        val1 = cvRound(x1);
        val2 = cvRound(x2);
        val3 = cvRound(x3);
-
+
        x0 = (x0 - val0)*exp_postscale;
        x1 = (x1 - val1)*exp_postscale;
        x2 = (x2 - val2)*exp_postscale;
        x3 = (x3 - val3)*exp_postscale;
-
-        t = (val0 >> EXPTAB_SCALE) + 1023;
-        t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
-        buf[0].i.hi = t << 20;
-
-        t = (val1 >> EXPTAB_SCALE) + 1023;
-        t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
-        buf[1].i.hi = t << 20;
-
-        t = (val2 >> EXPTAB_SCALE) + 1023;
-        t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
-        buf[2].i.hi = t << 20;
-
-        t = (val3 >> EXPTAB_SCALE) + 1023;
-        t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
-        buf[3].i.hi = t << 20;
-
-        x0 = buf[0].d * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
-        x1 = buf[1].d * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
-
-        y[i] = (float)x0;
-        y[i + 1] = (float)x1;
-
-        x2 = buf[2].d * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
-        x3 = buf[3].d * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
-
-        y[i + 2] = (float)x2;
-        y[i + 3] = (float)x3;
+
+        t = (val0 >> EXPTAB_SCALE) + 127;
+        t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+        buf[0].i = t << 23;
+
+        t = (val1 >> EXPTAB_SCALE) + 127;
+        t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+        buf[1].i = t << 23;
+
+        t = (val2 >> EXPTAB_SCALE) + 127;
+        t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+        buf[2].i = t << 23;
+
+        t = (val3 >> EXPTAB_SCALE) + 127;
+        t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+        buf[3].i = t << 23;
+
+        x0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
+        x1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
+
+        y[i] = x0;
+        y[i + 1] = x1;
+
+        x2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
+        x3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
+
+        y[i + 2] = x2;
+        y[i + 3] = x3;
     }
-
+
     for( ; i < n; i++ )
     {
         double x0 = x[i].f * exp_prescale;
         int val0, t;
-
+
         if( ((x[i].i >> 23) & 255) > 127 + 10 )
             x0 = x[i].i < 0 ? -exp_max_val : exp_max_val;
-
+
         val0 = cvRound(x0);
-        t = (val0 >> EXPTAB_SCALE) + 1023;
-        t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
-
-        buf[0].i.hi = t << 20;
+        t = (val0 >> EXPTAB_SCALE) + 127;
+        t = !(t & ~255) ? t : t < 0 ? 0 : 255;
+
+        buf[0].i = t << 23;
        x0 = (x0 - val0)*exp_postscale;
-
-        y[i] = (float)(buf[0].d * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0));
+
+        y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0);
     }
-
+
     return CV_OK;
 }
-
+
 static CvStatus CV_STDCALL Exp_64f( const double *_x, double *y, int n )
 {
     static const double
-    A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
-    A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
-    A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0,
-    A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0,
-    A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,
-    A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;
-
-    #undef EXPPOLY
-    #define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5)
-
+        A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
+        A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
+        A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0,
+        A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0,
+        A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,
+        A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;
+
+#undef EXPPOLY
+#define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5)
+
     int i = 0;
-    DBLINT buf[4];
+    Cv64suf buf[4];
     const Cv64suf* x = (const Cv64suf*)_x;
-
-    buf[0].i.lo = buf[1].i.lo = buf[2].i.lo = buf[3].i.lo = 0;
-
+
+#if CV_SSE2
+    if( checkHardwareSupport(CV_CPU_SSE) )
+    {
+        static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
+        static const __m128d postscale2 = _mm_set1_pd(exp_postscale);
+        static const __m128d maxval2 = _mm_set1_pd(exp_max_val);
+        static const __m128d minval2 = _mm_set1_pd(-exp_max_val);
+
+        static const __m128d mA0 = _mm_set1_pd(A0);
+        static const __m128d mA1 = _mm_set1_pd(A1);
+        static const __m128d mA2 = _mm_set1_pd(A2);
+        static const __m128d mA3 = _mm_set1_pd(A3);
+        static const __m128d mA4 = _mm_set1_pd(A4);
+        static const __m128d mA5 = _mm_set1_pd(A5);
+
+        int CV_DECL_ALIGNED(16) tab_idx[4];
+
+        for( ; i <= n - 4; i += 4 )
+        {
+            __m128d xf0 = _mm_loadu_pd(&x[i].f), xf1 = _mm_loadu_pd(&x[i+2].f);
+            __m128i xi0, xi1;
+            xf0 = _mm_min_pd(_mm_max_pd(xf0, minval2), maxval2);
+            xf1 = _mm_min_pd(_mm_max_pd(xf1, minval2), maxval2);
+            xf0 = _mm_mul_pd(xf0, prescale2);
+            xf1 = _mm_mul_pd(xf1, prescale2);
+
+            xi0 = _mm_cvtpd_epi32(xf0);
+            xi1 = _mm_cvtpd_epi32(xf1);
+            xf0 = _mm_mul_pd(_mm_sub_pd(xf0, _mm_cvtepi32_pd(xi0)), postscale2);
+            xf1 = _mm_mul_pd(_mm_sub_pd(xf1, _mm_cvtepi32_pd(xi1)), postscale2);
+
+            xi0 = _mm_unpacklo_epi64(xi0, xi1);
+            _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi32(EXPTAB_MASK)));
+
+            xi0 = _mm_add_epi32(_mm_srai_epi32(xi0, EXPTAB_SCALE), _mm_set1_epi32(1023));
+            xi0 = _mm_packs_epi32(xi0, xi0);
+            xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
+            xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(2047));
+            xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
+            xi1 = _mm_unpackhi_epi32(xi0, _mm_setzero_si128());
+            xi0 = _mm_unpacklo_epi32(xi0, _mm_setzero_si128());
+
+            __m128d yf0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1]));
+            __m128d yf1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3]));
+            yf0 = _mm_mul_pd(yf0, _mm_castsi128_pd(_mm_slli_epi64(xi0, 52)));
+            yf1 = _mm_mul_pd(yf1, _mm_castsi128_pd(_mm_slli_epi64(xi1, 52)));
+
+            __m128d zf0 = _mm_add_pd(_mm_mul_pd(mA0, xf0), mA1);
+            __m128d zf1 = _mm_add_pd(_mm_mul_pd(mA0, xf1), mA1);
+
+            zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA2);
+            zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA2);
+
+            zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA3);
+            zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA3);
+
+            zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA4);
+            zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA4);
+
+            zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA5);
+            zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA5);
+
+            zf0 = _mm_mul_pd(zf0, yf0);
+            zf1 = _mm_mul_pd(zf1, yf1);
+
+            _mm_storeu_pd(y + i, zf0);
+            _mm_storeu_pd(y + i + 2, zf1);
+        }
+    }
+    else
+#endif
     for( ; i <= n - 4; i += 4 )
     {
         double x0 = x[i].f * exp_prescale;
         double x1 = x[i + 1].f * exp_prescale;
         double x2 = x[i + 2].f * exp_prescale;
         double x3 = x[i + 3].f * exp_prescale;
-
+
         double y0, y1, y2, y3;
         int val0, val1, val2, val3, t;
-
+
         t = (int)(x[i].i >> 52);
         if( (t & 2047) > 1023 + 10 )
             x0 = t < 0 ? -exp_max_val : exp_max_val;
-
+
         t = (int)(x[i+1].i >> 52);
         if( (t & 2047) > 1023 + 10 )
             x1 = t < 0 ? -exp_max_val : exp_max_val;
-
+
         t = (int)(x[i+2].i >> 52);
         if( (t & 2047) > 1023 + 10 )
             x2 = t < 0 ? -exp_max_val : exp_max_val;
-
+
         t = (int)(x[i+3].i >> 52);
         if( (t & 2047) > 1023 + 10 )
             x3 = t < 0 ? -exp_max_val : exp_max_val;
-
+
         val0 = cvRound(x0);
         val1 = cvRound(x1);
         val2 = cvRound(x2);
         val3 = cvRound(x3);
-
+
         x0 = (x0 - val0)*exp_postscale;
         x1 = (x1 - val1)*exp_postscale;
         x2 = (x2 - val2)*exp_postscale;
         x3 = (x3 - val3)*exp_postscale;
-
+
         t = (val0 >> EXPTAB_SCALE) + 1023;
-        t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
-        buf[0].i.hi = t << 20;
-
+        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+        buf[0].i = (int64)t << 52;
+
         t = (val1 >> EXPTAB_SCALE) + 1023;
-        t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
-        buf[1].i.hi = t << 20;
-
+        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+        buf[1].i = (int64)t << 52;
+
         t = (val2 >> EXPTAB_SCALE) + 1023;
-        t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
-        buf[2].i.hi = t << 20;
-
+        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+        buf[2].i = (int64)t << 52;
+
         t = (val3 >> EXPTAB_SCALE) + 1023;
-        t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
-        buf[3].i.hi = t << 20;
-
-        y0 = buf[0].d * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
-        y1 = buf[1].d * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
-
+        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+        buf[3].i = (int64)t << 52;
+
+        y0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
+        y1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 );
+
         y[i] = y0;
         y[i + 1] = y1;
-
-        y2 = buf[2].d * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
-        y3 = buf[3].d * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
-
+
+        y2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 );
+        y3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 );
+
         y[i + 2] = y2;
         y[i + 3] = y3;
     }
-
+
     for( ; i < n; i++ )
     {
         double x0 = x[i].f * exp_prescale;
         int val0, t;
-
+
         t = (int)(x[i].i >> 52);
         if( (t & 2047) > 1023 + 10 )
             x0 = t < 0 ? -exp_max_val : exp_max_val;
-
+
         val0 = cvRound(x0);
         t = (val0 >> EXPTAB_SCALE) + 1023;
-        t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047);
-
-        buf[0].i.hi = t << 20;
+        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;
+
+        buf[0].i = (int64)t << 52;
         x0 = (x0 - val0)*exp_postscale;
-
-        y[i] = buf[0].d * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
+
+        y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 );
     }
-
+
     return CV_OK;
 }
 
@@ -968,7 +1141,7 @@ void exp( const Mat& src, Mat& dst )
 #define LOGTAB_MASK2 ((1 << (20 - LOGTAB_SCALE)) - 1)
 #define LOGTAB_MASK2_32F ((1 << (23 - LOGTAB_SCALE)) - 1)
 
-static const double icvLogTab[] = {
+static const double CV_DECL_ALIGNED(16) icvLogTab[] = {
     0.0000000000000000000000000000000000000000, 1.000000000000000000000000000000000000000,
     .00389864041565732288852075271279318258166, .9961089494163424124513618677042801556420,
     .00778214044205494809292034119607706088573, .9922480620155038759689922480620155038760,
@@ -1234,26 +1407,75 @@ static const double ln_2 = 0.69314718055994530941723212145818;
 
 static CvStatus CV_STDCALL Log_32f( const float *_x, float *y, int n )
 {
-    static const double shift[] = { 0, -1./512 };
-    static const double
-    A0 = 0.3333333333333333333333333,
-    A1 = -0.5,
-    A2 = 1;
+    static const float shift[] = { 0, -1.f/512 };
+    static const float
+        A0 = 0.3333333333333333333333333f,
+        A1 = -0.5f,
+        A2 = 1.f;
 
     #undef LOGPOLY
-    #define LOGPOLY(x,k) ((x)+=shift[k],((A0*(x) + A1)*(x) + A2)*(x))
+    #define LOGPOLY(x) (((A0*(x) + A1)*(x) + A2)*(x))
 
     int i = 0;
-    union
-    {
-        int i;
-        float f;
-    }
-    buf[4];
-
+    Cv32suf buf[4];
     const int* x = (const int*)_x;
 
-    for( i = 0; i <= n - 4; i += 4 )
+#if CV_SSE2
+    if( checkHardwareSupport(CV_CPU_SSE) )
+    {
+        static const __m128d ln2_2 = _mm_set1_pd(ln_2);
+        static const __m128 _1_4 = _mm_set1_ps(1.f);
+        static const __m128 shift4 = _mm_set1_ps(-1.f/512);
+
+        static const __m128 mA0 = _mm_set1_ps(A0);
+        static const __m128 mA1 = _mm_set1_ps(A1);
+        static const __m128 mA2 = _mm_set1_ps(A2);
+
+        int CV_DECL_ALIGNED(16) idx[4];
+
+        for( ; i <= n - 4; i += 4 )
+        {
+            __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i));
+            __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 23), _mm_set1_epi32(255)), _mm_set1_epi32(127));
+            __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2);
+            __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0,yi0)), ln2_2);
+
+            __m128i xi0 = _mm_or_si128(_mm_and_si128(h0, _mm_set1_epi32(LOGTAB_MASK2_32F)), _mm_set1_epi32(127 << 23));
+
+            h0 = _mm_and_si128(_mm_srli_epi32(h0, 23 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK*2));
+            _mm_store_si128((__m128i*)idx, h0);
+            h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510));
+
+            __m128d t0, t1, t2, t3, t4;
+            t0 = _mm_load_pd(icvLogTab + idx[0]);
+            t2 = _mm_load_pd(icvLogTab + idx[1]);
+            t1 = _mm_unpackhi_pd(t0, t2);
+            t0 = _mm_unpacklo_pd(t0, t2);
+            t2 = _mm_load_pd(icvLogTab + idx[2]);
+            t4 = _mm_load_pd(icvLogTab + idx[3]);
+            t3 = _mm_unpackhi_pd(t2, t4);
+            t2 = _mm_unpacklo_pd(t2, t4);
+
+            yd0 = _mm_add_pd(yd0, t0);
+            yd1 = _mm_add_pd(yd1, t2);
+
+            __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1));
+
+            __m128 xf0 = _mm_sub_ps(_mm_castsi128_ps(xi0), _1_4);
+            xf0 = _mm_mul_ps(xf0, _mm_movelh_ps(_mm_cvtpd_ps(t1), _mm_cvtpd_ps(t3)));
+            xf0 = _mm_add_ps(xf0, _mm_and_ps(_mm_castsi128_ps(h0), shift4));
+
+            __m128 zf0 = _mm_mul_ps(xf0, mA0);
+            zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA1), xf0);
+            zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA2), xf0);
+            yf0 = _mm_add_ps(yf0, zf0);
+
+            _mm_storeu_ps(y + i, yf0);
+        }
+    }
+    else
+#endif
+    for( ; i <= n - 4; i += 4 )
     {
         double x0, x1, x2, x3;
         double y0, y1, y2, y3;
@@ -1294,14 +1516,18 @@ static CvStatus CV_STDCALL Log_32f( const float *_x, float *y, int n )
         x2 = LOGTAB_TRANSLATE( buf[2].f, h2 );
         x3 = LOGTAB_TRANSLATE( buf[3].f, h3 );
 
-        y0 += LOGPOLY( x0, h0 == 510 );
-        y1 += LOGPOLY( x1, h1 == 510 );
+        x0 += shift[h0 == 510];
+        x1 += shift[h1 == 510];
+        y0 += LOGPOLY( x0 );
+        y1 += LOGPOLY( x1 );
 
         y[i] = (float) y0;
         y[i + 1] = (float) y1;
 
-        y2 += LOGPOLY( x2, h2 == 510 );
-        y3 += LOGPOLY( x3, h3 == 510 );
+        x2 += shift[h2 == 510];
+        x3 += shift[h3 == 510];
+        y2 += LOGPOLY( x2 );
+        y3 += LOGPOLY( x3 );
 
         y[i + 2] = (float) y2;
         y[i + 3] = (float) y3;
@@ -1310,7 +1536,8 @@ static CvStatus CV_STDCALL Log_32f( const float *_x, float *y, int n )
     for( ; i < n; i++ )
     {
         int h0 = x[i];
-        double x0, y0;
+        double y0;
+        float x0;
 
         y0 = (((h0 >> 23) & 0xff) - 127) * ln_2;
 
@@ -1319,7 +1546,8 @@ static CvStatus CV_STDCALL Log_32f( const float *_x, float *y, int n )
         y0 += icvLogTab[h0];
         x0 = LOGTAB_TRANSLATE( buf[0].f, h0 );
 
-        y0 += LOGPOLY( x0, h0 == 510 );
+        x0 += shift[h0 == 510];
+        y0 += LOGPOLY( x0 );
 
         y[i] = (float)y0;
     }
@@ -1350,6 +1578,91 @@ static CvStatus CV_STDCALL Log_64f( const double *x, double *y, int n )
     DBLINT buf[4];
     DBLINT *X = (DBLINT *) x;
 
+#if CV_SSE2
+    if( checkHardwareSupport(CV_CPU_SSE) )
+    {
+        static const __m128d ln2_2 = _mm_set1_pd(ln_2);
+        static const __m128d _1_2 = _mm_set1_pd(1.);
+        static const __m128d shift2 = _mm_set1_pd(-1./512);
+
+        static const __m128i log_and_mask2 = _mm_set_epi32(LOGTAB_MASK2, 0xffffffff, LOGTAB_MASK2, 0xffffffff);
+        static const __m128i log_or_mask2 = _mm_set_epi32(1023 << 20, 0, 1023 << 20, 0);
+
+        static const __m128d mA0 = _mm_set1_pd(A0);
+        static const __m128d mA1 = _mm_set1_pd(A1);
+        static const __m128d mA2 = _mm_set1_pd(A2);
+        static const __m128d mA3 = _mm_set1_pd(A3);
+        static const __m128d mA4 = _mm_set1_pd(A4);
+        static const __m128d mA5 = _mm_set1_pd(A5);
+        static const __m128d mA6 = _mm_set1_pd(A6);
+        static const __m128d mA7 = _mm_set1_pd(A7);
+
+        int CV_DECL_ALIGNED(16) idx[4];
+
+        for( ; i <= n - 4; i += 4 )
+        {
+            __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i));
+            __m128i h1 = _mm_loadu_si128((const __m128i*)(x + i + 2));
+
+            __m128d xd0 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h0, log_and_mask2), log_or_mask2));
+            __m128d xd1 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h1, log_and_mask2), log_or_mask2));
+
+            h0 = _mm_unpackhi_epi32(_mm_unpacklo_epi32(h0, h1), _mm_unpackhi_epi32(h0, h1));
+
+            __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 20),
+                          _mm_set1_epi32(2047)), _mm_set1_epi32(1023));
+            __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2);
+            __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0, yi0)), ln2_2);
+
+            h0 = _mm_and_si128(_mm_srli_epi32(h0, 20 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK * 2));
+            _mm_store_si128((__m128i*)idx, h0);
+            h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510));
+
+            __m128d t0, t1, t2, t3, t4;
+            t0 = _mm_load_pd(icvLogTab + idx[0]);
+            t2 = _mm_load_pd(icvLogTab + idx[1]);
+            t1 = _mm_unpackhi_pd(t0, t2);
+            t0 = _mm_unpacklo_pd(t0, t2);
+            t2 = _mm_load_pd(icvLogTab + idx[2]);
+            t4 = _mm_load_pd(icvLogTab + idx[3]);
+            t3 = _mm_unpackhi_pd(t2, t4);
+            t2 = _mm_unpacklo_pd(t2, t4);
+
+            yd0 = _mm_add_pd(yd0, t0);
+            yd1 = _mm_add_pd(yd1, t2);
+
+            xd0 = _mm_mul_pd(_mm_sub_pd(xd0, _1_2), t1);
+            xd1 = _mm_mul_pd(_mm_sub_pd(xd1, _1_2), t3);
+
+            xd0 = _mm_add_pd(xd0, _mm_and_pd(_mm_castsi128_pd(_mm_unpacklo_epi32(h0, h0)), shift2));
+            xd1 = _mm_add_pd(xd1, _mm_and_pd(_mm_castsi128_pd(_mm_unpackhi_epi32(h0, h0)), shift2));
+
+            __m128d zd0 = _mm_mul_pd(xd0, mA0);
+            __m128d zd1 = _mm_mul_pd(xd1, mA0);
+            zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA1), xd0);
+            zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA1), xd1);
+            zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA2), xd0);
+            zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA2), xd1);
+            zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA3), xd0);
+            zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA3), xd1);
+            zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA4), xd0);
+            zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA4), xd1);
+            zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA5), xd0);
+            zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA5), xd1);
+            zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA6), xd0);
+            zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA6), xd1);
+            zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA7), xd0);
+            zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA7), xd1);
+
+            yd0 = _mm_add_pd(yd0, zd0);
+            yd1 = _mm_add_pd(yd1, zd1);
+
+            _mm_storeu_pd(y + i, yd0);
+            _mm_storeu_pd(y + i + 2, yd1);
+        }
+    }
+    else
+#endif
     for( ; i <= n - 4; i += 4 )
     {
         double xq;
@@ -1414,7 +1727,7 @@ static CvStatus CV_STDCALL Log_64f( const double *x, double *y, int n )
         y[i + 2] = y2;
         y[i + 3] = y3;
     }
-
+
     for( ; i < n; i++ )
     {
         int h0 = X[i].i.hi;
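
Reviewer note (commentary, not part of the patch): the new SSE2 branches reproduce, lane for lane, the table-driven schemes the scalar code already implemented, and the scalar paths are cleaned up in passing. The old branchless clamp t = (t | ((t < 2047) - 1)) & (((t < 0) - 1) & 2047) and the new t = !(t & ~2047) ? t : t < 0 ? 0 : 2047 both clamp t to [0, 2047]; the new form is simply easier to read. Exp_32f also stops assembling its scale factor in a double's high word (DBLINT, t << 20, bias 1023) and builds it in a float's exponent field instead (Cv32suf, t << 23, bias 127), which is what lets the vector path use a single _mm_slli_epi32(xi0, 23). The underlying algorithm: exp(x) = 2^(x*log2(e)) is split as 2^(val >> EXPTAB_SCALE) * expTab[val & EXPTAB_MASK] * 2^r with val = cvRound(x * exp_prescale); the first factor is constructed directly in the IEEE-754 exponent bits, the second is a table lookup, and 2^r for the small residual r comes from the short EXPPOLY polynomial. Log goes the other way: the exponent bits give e*ln(2), the top mantissa bits select a precomputed pair (ln(t_i), 1/t_i) from icvLogTab, and LOGPOLY's cubic supplies ln(1 + r).

The standalone sketch below illustrates that scheme; it is not code from the patch. The names exp_sketch, log_sketch and SCALE are invented, the tables are filled with std::exp2/std::log at runtime where the library stores precomputed constants, plain Taylor terms stand in for the tuned EXPPOLY/LOGPOLY coefficients, and the round-to-nearest index correction (the shift[h == 510] term) is simplified away by truncating the table index instead of rounding it.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

static const int SCALE = 6;                  // matches EXPTAB_SCALE: a 64-entry table
static const int MASK  = (1 << SCALE) - 1;

// exp(x) = 2^(val>>SCALE) * 2^((val&MASK)/64) * 2^r, val = round(x*log2(e)*2^SCALE)
static double exp_sketch(double x)
{
    static double tab[1 << SCALE];
    static bool init = false;
    if (!init) {
        for (int j = 0; j < (1 << SCALE); j++)    // 2^(j/64), the expTab analogue
            tab[j] = std::exp2(j / double(1 << SCALE));
        init = true;
    }

    double z = x * 1.4426950408889634 * (1 << SCALE);   // x*log2(e)*2^SCALE
    const double zmax = 3000.0 * (1 << SCALE);          // exp_max_val analogue
    z = z > zmax ? zmax : z < -zmax ? -zmax : z;

    int val = (int)std::lround(z);
    double r = (z - val) / (1 << SCALE);                // residual, |r| <= 2^-7

    int t = (val >> SCALE) + 1023;                      // biased exponent of 2^(val>>SCALE)
    t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;            // the patch's clamp to [0, 2047]
    uint64_t bits = (uint64_t)t << 52;
    double p2;
    std::memcpy(&p2, &bits, sizeof(p2));                // the power of two, built bitwise

    double u = r * 0.6931471805599453;                  // 2^r = e^(r*ln2), short Taylor series
    double poly = 1 + u*(1 + u*(1./2 + u*(1./6 + u/24)));
    return p2 * tab[val & MASK] * poly;
}

// ln(x) = e*ln2 + ln(t_i) + ln(1+r) where mantissa m = t_i*(1+r), t_i = 1 + i/256.
// Assumes x is positive and normal.
static double log_sketch(double x)
{
    uint64_t h;
    std::memcpy(&h, &x, sizeof(h));
    int e = (int)((h >> 52) & 2047) - 1023;             // unbiased exponent

    uint64_t mb = (h & ((UINT64_C(1) << 52) - 1)) | (UINT64_C(1023) << 52);
    double m;
    std::memcpy(&m, &mb, sizeof(m));                    // mantissa rescaled into [1, 2)

    int i = (int)((h >> 44) & 255);                     // top 8 mantissa bits
    double ti = 1.0 + i / 256.0;
    double r = (m - ti) / ti;   // the real code multiplies by the tabulated 1/t_i instead

    double poly = ((r/3 - 0.5)*r + 1)*r;                // ln(1+r) ~ r - r^2/2 + r^3/3 (LOGPOLY's cubic)
    return e * 0.6931471805599453 + std::log(ti) + poly;
}

int main()
{
    const double xs[] = { -10.0, -0.5, 0.0, 0.5, 3.0, 20.0 };
    for (double v : xs)
        std::printf("exp(%6.2f): sketch=%.12g  std=%.12g\n", v, exp_sketch(v), std::exp(v));
    const double ys[] = { 0.001, 0.5, 1.0, 2.718281828, 1e6 };
    for (double v : ys)
        std::printf("log(%g): sketch=%.12g  std=%.12g\n", v, log_sketch(v), std::log(v));
    return 0;
}

Compiled standalone, the sketch should agree with std::exp and std::log to within roughly 1e-10 relative error over the sample values; the patch's tuned minimax coefficients and rounded table index exist to tighten exactly this residual term.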