diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp
index 7b3fd6ea8..836f3ddbf 100644
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -197,6 +197,420 @@ template<typename ST, typename T> struct ColumnSum : public BaseColumnFilter
 };
 
+// 32-bit accumulator, 8-bit destination. A sliding-window column sum: the
+// first ksize-1 row sums are accumulated into SUM once; afterwards each
+// output row adds the newest row sum (Sp), stores the scaled and saturated
+// result, and subtracts the oldest row sum (Sm), so the per-row cost is
+// independent of the kernel height.
+template<> struct ColumnSum<int, uchar> : public BaseColumnFilter
+{
+    ColumnSum( int _ksize, int _anchor, double _scale )
+    {
+        ksize = _ksize;
+        anchor = _anchor;
+        scale = _scale;
+        sumCount = 0;
+    }
+
+    void reset() { sumCount = 0; }
+
+    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
+    {
+        int i;
+        int* SUM;
+        bool haveScale = scale != 1;
+        double _scale = scale;
+
+#if CV_SSE2
+        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#endif
+
+        if( width != (int)sum.size() )
+        {
+            sum.resize(width);
+            sumCount = 0;
+        }
+
+        SUM = &sum[0];
+        if( sumCount == 0 )
+        {
+            memset((void*)SUM, 0, width*sizeof(int));
+            for( ; sumCount < ksize - 1; sumCount++, src++ )
+            {
+                const int* Sp = (const int*)src[0];
+                i = 0;
+#if CV_SSE2
+                if(haveSSE2)
+                {
+                    for( ; i < width-4; i+=4 )
+                    {
+                        __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
+                        __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
+                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp));
+                    }
+                }
+#endif
+                for( ; i < width; i++ )
+                    SUM[i] += Sp[i];
+            }
+        }
+        else
+        {
+            CV_Assert( sumCount == ksize-1 );
+            src += ksize-1;
+        }
+
+        for( ; count--; src++ )
+        {
+            const int* Sp = (const int*)src[0];
+            const int* Sm = (const int*)src[1-ksize];
+            uchar* D = (uchar*)dst;
+            if( haveScale )
+            {
+                i = 0;
+#if CV_SSE2
+                if(haveSSE2)
+                {
+                    const __m128 scale4 = _mm_set1_ps((float)_scale);
+                    for( ; i < width-8; i+=8 )
+                    {
+                        __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
+                        __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
+
+                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
+                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));
+                        __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
+                                                     _mm_loadu_si128((const __m128i*)(Sp+i+4)));
+
+                        __m128i _s0T  = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
+                        __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));
+
+                        _s0T = _mm_packs_epi32(_s0T, _s0T1);
+
+                        _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));
+
+                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
+                        _mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1));
+                    }
+                }
+#endif
+                for( ; i < width; i++ )
+                {
+                    int s0 = SUM[i] + Sp[i];
+                    D[i] = saturate_cast<uchar>(s0*_scale);
+                    SUM[i] = s0 - Sm[i];
+                }
+            }
+            else
+            {
+                i = 0;
+#if CV_SSE2
+                if(haveSSE2)
+                {
+                    for( ; i < width-8; i+=8 )
+                    {
+                        __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
+                        __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
+
+                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
+                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));
+                        __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
+                                                     _mm_loadu_si128((const __m128i*)(Sp+i+4)));
+
+                        __m128i _s0T = _mm_packs_epi32(_s0, _s01);
+
+                        _mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));
+
+                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
+                        _mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1));
+                    }
+                }
+#endif
+
+                for( ; i < width; i++ )
+                {
+                    int s0 = SUM[i] + Sp[i];
+                    D[i] = saturate_cast<uchar>(s0);
+                    SUM[i] = s0 - Sm[i];
+                }
+            }
+            dst += dststep;
+        }
+    }
+
+    double scale;
+    int sumCount;
+    vector<int> sum;
+};
+
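+// 32-bit accumulator, signed 16-bit destination. Same sliding-window update
+// as the uchar specialization above; here a single _mm_packs_epi32 already
+// produces the saturated 16-bit result, so the extra _mm_packus_epi16 step
+// is not needed and a full 16-byte store replaces the 8-byte one.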
+template<> struct ColumnSum<int, short> : public BaseColumnFilter
+{
+    ColumnSum( int _ksize, int _anchor, double _scale )
+    {
+        ksize = _ksize;
+        anchor = _anchor;
+        scale = _scale;
+        sumCount = 0;
+    }
+
+    void reset() { sumCount = 0; }
+
+    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
+    {
+        int i;
+        int* SUM;
+        bool haveScale = scale != 1;
+        double _scale = scale;
+
+#if CV_SSE2
+        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#endif
+
+        if( width != (int)sum.size() )
+        {
+            sum.resize(width);
+            sumCount = 0;
+        }
+        SUM = &sum[0];
+        if( sumCount == 0 )
+        {
+            memset((void*)SUM, 0, width*sizeof(int));
+            for( ; sumCount < ksize - 1; sumCount++, src++ )
+            {
+                const int* Sp = (const int*)src[0];
+                i = 0;
+#if CV_SSE2
+                if(haveSSE2)
+                {
+                    for( ; i < width-4; i+=4 )
+                    {
+                        __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
+                        __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
+                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp));
+                    }
+                }
+#endif
+                for( ; i < width; i++ )
+                    SUM[i] += Sp[i];
+            }
+        }
+        else
+        {
+            CV_Assert( sumCount == ksize-1 );
+            src += ksize-1;
+        }
+
+        for( ; count--; src++ )
+        {
+            const int* Sp = (const int*)src[0];
+            const int* Sm = (const int*)src[1-ksize];
+            short* D = (short*)dst;
+            if( haveScale )
+            {
+                i = 0;
+#if CV_SSE2
+                if(haveSSE2)
+                {
+                    const __m128 scale4 = _mm_set1_ps((float)_scale);
+                    for( ; i < width-8; i+=8 )
+                    {
+                        __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
+                        __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
+
+                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
+                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));
+                        __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
+                                                     _mm_loadu_si128((const __m128i*)(Sp+i+4)));
+
+                        __m128i _s0T  = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
+                        __m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));
+
+                        _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0T, _s0T1));
+
+                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
+                        _mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1));
+                    }
+                }
+#endif
+                for( ; i < width; i++ )
+                {
+                    int s0 = SUM[i] + Sp[i];
+                    D[i] = saturate_cast<short>(s0*_scale);
+                    SUM[i] = s0 - Sm[i];
+                }
+            }
+            else
+            {
+                i = 0;
+#if CV_SSE2
+                if(haveSSE2)
+                {
+                    for( ; i < width-8; i+=8 )
+                    {
+                        __m128i _sm  = _mm_loadu_si128((const __m128i*)(Sm+i));
+                        __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
+
+                        __m128i _s0  = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
+                                                     _mm_loadu_si128((const __m128i*)(Sp+i)));
+                        __m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)),
+                                                     _mm_loadu_si128((const __m128i*)(Sp+i+4)));
+
+                        _mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0, _s01));
+
+                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
+                        _mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1));
+                    }
+                }
+#endif
+
+                for( ; i < width; i++ )
+                {
+                    int s0 = SUM[i] + Sp[i];
+                    D[i] = saturate_cast<short>(s0);
+                    SUM[i] = s0 - Sm[i];
+                }
+            }
+            dst += dststep;
+        }
+    }
+
+    double scale;
+    int sumCount;
+    vector<int> sum;
+};
+
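+// 32-bit accumulator, unsigned 16-bit destination. SSE2 has no pack with
+// unsigned 16-bit saturation, so the store biases each 32-bit lane by
+// 0x8000, packs with signed saturation, and then adds the bias back per
+// 16-bit lane (delta1 = 0x80008000 holds two 16-bit 0x8000 values).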
+template<> struct ColumnSum<int, ushort> : public BaseColumnFilter
+{
+    ColumnSum( int _ksize, int _anchor, double _scale )
+    {
+        ksize = _ksize;
+        anchor = _anchor;
+        scale = _scale;
+        sumCount = 0;
+    }
+
+    void reset() { sumCount = 0; }
+
+    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
+    {
+        int i;
+        int* SUM;
+        bool haveScale = scale != 1;
+        double _scale = scale;
+#if CV_SSE2
+        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+#endif
+
+        if( width != (int)sum.size() )
+        {
+            sum.resize(width);
+            sumCount = 0;
+        }
+        SUM = &sum[0];
+        if( sumCount == 0 )
+        {
+            memset((void*)SUM, 0, width*sizeof(int));
+            for( ; sumCount < ksize - 1; sumCount++, src++ )
+            {
+                const int* Sp = (const int*)src[0];
+                i = 0;
+#if CV_SSE2
+                if(haveSSE2)
+                {
+                    for( ; i < width-4; i+=4 )
+                    {
+                        __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
+                        __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
+                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp));
+                    }
+                }
+#endif
+                for( ; i < width; i++ )
+                    SUM[i] += Sp[i];
+            }
+        }
+        else
+        {
+            CV_Assert( sumCount == ksize-1 );
+            src += ksize-1;
+        }
+
+        for( ; count--; src++ )
+        {
+            const int* Sp = (const int*)src[0];
+            const int* Sm = (const int*)src[1-ksize];
+            ushort* D = (ushort*)dst;
+            if( haveScale )
+            {
+                i = 0;
+#if CV_SSE2
+                if(haveSSE2)
+                {
+                    const __m128 scale4 = _mm_set1_ps((float)_scale);
+                    const __m128i delta0 = _mm_set1_epi32(0x8000);
+                    const __m128i delta1 = _mm_set1_epi32(0x80008000);
+
+                    for( ; i < width-4; i+=4 )
+                    {
+                        __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
+                        __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
+                                                    _mm_loadu_si128((const __m128i*)(Sp+i)));
+
+                        __m128i _res = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
+
+                        _res = _mm_sub_epi32(_res, delta0);
+                        _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
+
+                        _mm_storel_epi64((__m128i*)(D+i), _res);
+                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
+                    }
+                }
+#endif
+                for( ; i < width; i++ )
+                {
+                    int s0 = SUM[i] + Sp[i];
+                    D[i] = saturate_cast<ushort>(s0*_scale);
+                    SUM[i] = s0 - Sm[i];
+                }
+            }
+            else
+            {
+                i = 0;
+#if CV_SSE2
+                if(haveSSE2)
+                {
+                    const __m128i delta0 = _mm_set1_epi32(0x8000);
+                    const __m128i delta1 = _mm_set1_epi32(0x80008000);
+
+                    for( ; i < width-4; i+=4 )
+                    {
+                        __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
+                        __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
+                                                    _mm_loadu_si128((const __m128i*)(Sp+i)));
+
+                        __m128i _res = _mm_sub_epi32(_s0, delta0);
+                        _res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
+
+                        _mm_storel_epi64((__m128i*)(D+i), _res);
+                        _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
+                    }
+                }
+#endif
+
+                for( ; i < width; i++ )
+                {
+                    int s0 = SUM[i] + Sp[i];
+                    D[i] = saturate_cast<ushort>(s0);
+                    SUM[i] = s0 - Sm[i];
+                }
+            }
+            dst += dststep;
+        }
+    }
+
+    double scale;
+    int sumCount;
+    vector<int> sum;
+};
+
+
 }
 
 cv::Ptr<cv::BaseRowFilter> cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor)
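For context, a minimal sketch (not part of the patch) of public-API calls that should reach the three new specializations. The mapping in the trailing comments assumes an 8-bit source and the CV_32S sum type that createBoxFilter normally selects for it, which is an inference about the dispatch rather than something this hunk shows:

    #include <opencv2/imgproc/imgproc.hpp>

    void smoke(const cv::Mat& src)                        // assume a CV_8UC1 image
    {
        cv::Mat d8u, d16s, d16u;
        cv::blur(src, d8u, cv::Size(5, 5));               // ColumnSum<int, uchar>
        cv::boxFilter(src, d16s, CV_16S, cv::Size(5, 5)); // ColumnSum<int, short>
        cv::boxFilter(src, d16u, CV_16U, cv::Size(5, 5)); // ColumnSum<int, ushort>
    }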