From 3456238c1655118e3459d78f9895b4e27d0b990e Mon Sep 17 00:00:00 2001 From: Anatoly Baksheev Date: Wed, 29 Aug 2012 21:25:35 +0400 Subject: [PATCH] minor --- modules/gpu/test/test_denoising.cpp | 4 +- modules/imgproc/src/smooth.cpp | 286 ++++++++++++++-------------- 2 files changed, 144 insertions(+), 146 deletions(-) diff --git a/modules/gpu/test/test_denoising.cpp b/modules/gpu/test/test_denoising.cpp index 0f6cd69c5..f46c87671 100644 --- a/modules/gpu/test/test_denoising.cpp +++ b/modules/gpu/test/test_denoising.cpp @@ -86,7 +86,7 @@ TEST_P(BilateralFilter, Accuracy) EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-3 : 1.0); } -INSTANTIATE_TEST_CASE_P(GPU_ImgProc, BilateralFilter, testing::Combine( +INSTANTIATE_TEST_CASE_P(GPU_Denoising, BilateralFilter, testing::Combine( ALL_DEVICES, testing::Values(cv::Size(128, 128), cv::Size(113, 113), cv::Size(639, 481)), testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_32FC1), MatType(CV_32FC3)) @@ -134,7 +134,7 @@ TEST_P(NonLocalMeans, Regression) EXPECT_MAT_NEAR(gray_gold, dgray, 1e-4); } -INSTANTIATE_TEST_CASE_P(GPU_ImgProc, NonLocalMeans, ALL_DEVICES); +INSTANTIATE_TEST_CASE_P(GPU_Denoising, NonLocalMeans, ALL_DEVICES); #endif // HAVE_CUDA \ No newline at end of file diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 13340511f..7b3fd6ea8 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -1285,8 +1285,6 @@ void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize ) Bilateral Filtering \****************************************************************************************/ -#undef CV_SSE3 - namespace cv { @@ -1305,12 +1303,12 @@ public: { int i, j, cn = dest->channels(), k; Size size = dest->size(); - #if CV_SSE3 + #if CV_SSE3 int CV_DECL_ALIGNED(16) buf[4]; float CV_DECL_ALIGNED(16) bufSum[4]; static const int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3); - #endif + #endif for( i = range.start; i < range.end; i++ ) { @@ -1323,36 +1321,36 @@ public: { float sum = 0, wsum = 0; int val0 = sptr[j]; - k = 0; - #if CV_SSE3 - if( haveSSE3 ) - { - __m128 _val0 = _mm_set1_ps(static_cast(val0)); - const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); + k = 0; + #if CV_SSE3 + if( haveSSE3 ) + { + __m128 _val0 = _mm_set1_ps(static_cast(val0)); + const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); - for( ; k <= maxk - 4; k += 4 ) - { - __m128 _valF = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]], + for( ; k <= maxk - 4; k += 4 ) + { + __m128 _valF = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]], sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]); - - __m128 _val = _mm_andnot_ps(_signMask, _mm_sub_ps(_valF, _val0)); - _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_val)); + + __m128 _val = _mm_andnot_ps(_signMask, _mm_sub_ps(_valF, _val0)); + _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_val)); - __m128 _cw = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]], + __m128 _cw = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]], color_weight[buf[1]],color_weight[buf[0]]); - __m128 _sw = _mm_loadu_ps(space_weight+k); - __m128 _w = _mm_mul_ps(_cw, _sw); - _cw = _mm_mul_ps(_w, _valF); + __m128 _sw = _mm_loadu_ps(space_weight+k); + __m128 _w = _mm_mul_ps(_cw, _sw); + _cw = _mm_mul_ps(_w, _valF); - _sw = _mm_hadd_ps(_w, _cw); - _sw = _mm_hadd_ps(_sw, _sw); - _mm_storel_pi((__m64*)bufSum, _sw); + _sw = _mm_hadd_ps(_w, _cw); + _sw = _mm_hadd_ps(_sw, _sw); + _mm_storel_pi((__m64*)bufSum, _sw); - sum += bufSum[1]; - wsum += bufSum[0]; - } - } - #endif + sum += bufSum[1]; + wsum += bufSum[0]; + } + } + #endif for( ; k < maxk; k++ ) { int val = sptr[j + space_ofs[k]]; @@ -1371,55 +1369,55 @@ public: { float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; - k = 0; - #if CV_SSE3 - if( haveSSE3 ) - { - const __m128 _b0 = _mm_set1_ps(static_cast(b0)); - const __m128 _g0 = _mm_set1_ps(static_cast(g0)); - const __m128 _r0 = _mm_set1_ps(static_cast(r0)); - const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); + k = 0; + #if CV_SSE3 + if( haveSSE3 ) + { + const __m128 _b0 = _mm_set1_ps(static_cast(b0)); + const __m128 _g0 = _mm_set1_ps(static_cast(g0)); + const __m128 _r0 = _mm_set1_ps(static_cast(r0)); + const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); for( ; k <= maxk - 4; k += 4 ) { - const uchar* sptr_k = sptr + j + space_ofs[k]; - const uchar* sptr_k1 = sptr + j + space_ofs[k+1]; - const uchar* sptr_k2 = sptr + j + space_ofs[k+2]; - const uchar* sptr_k3 = sptr + j + space_ofs[k+3]; + const uchar* sptr_k = sptr + j + space_ofs[k]; + const uchar* sptr_k1 = sptr + j + space_ofs[k+1]; + const uchar* sptr_k2 = sptr + j + space_ofs[k+2]; + const uchar* sptr_k3 = sptr + j + space_ofs[k+3]; - __m128 _b = _mm_set_ps(sptr_k3[0],sptr_k2[0],sptr_k1[0],sptr_k[0]); - __m128 _g = _mm_set_ps(sptr_k3[1],sptr_k2[1],sptr_k1[1],sptr_k[1]); - __m128 _r = _mm_set_ps(sptr_k3[2],sptr_k2[2],sptr_k1[2],sptr_k[2]); + __m128 _b = _mm_set_ps(sptr_k3[0],sptr_k2[0],sptr_k1[0],sptr_k[0]); + __m128 _g = _mm_set_ps(sptr_k3[1],sptr_k2[1],sptr_k1[1],sptr_k[1]); + __m128 _r = _mm_set_ps(sptr_k3[2],sptr_k2[2],sptr_k1[2],sptr_k[2]); - __m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0)); - __m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0)); - __m128 rt = _mm_andnot_ps(_signMask, _mm_sub_ps(_r,_r0)); + __m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0)); + __m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0)); + __m128 rt = _mm_andnot_ps(_signMask, _mm_sub_ps(_r,_r0)); - bt =_mm_add_ps(rt, _mm_add_ps(bt, gt)); - _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(bt)); + bt =_mm_add_ps(rt, _mm_add_ps(bt, gt)); + _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(bt)); - __m128 _w = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]], + __m128 _w = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]], color_weight[buf[1]],color_weight[buf[0]]); - __m128 _sw = _mm_loadu_ps(space_weight+k); + __m128 _sw = _mm_loadu_ps(space_weight+k); - _w = _mm_mul_ps(_w,_sw); - _b = _mm_mul_ps(_b, _w); - _g = _mm_mul_ps(_g, _w); - _r = _mm_mul_ps(_r, _w); + _w = _mm_mul_ps(_w,_sw); + _b = _mm_mul_ps(_b, _w); + _g = _mm_mul_ps(_g, _w); + _r = _mm_mul_ps(_r, _w); - _w = _mm_hadd_ps(_w, _b); - _g = _mm_hadd_ps(_g, _r); + _w = _mm_hadd_ps(_w, _b); + _g = _mm_hadd_ps(_g, _r); - _w = _mm_hadd_ps(_w, _g); - _mm_store_ps(bufSum, _w); + _w = _mm_hadd_ps(_w, _g); + _mm_store_ps(bufSum, _w); - wsum += bufSum[0]; - sum_b += bufSum[1]; - sum_g += bufSum[2]; - sum_r += bufSum[3]; - } - } - #endif + wsum += bufSum[0]; + sum_b += bufSum[1]; + sum_g += bufSum[2]; + sum_r += bufSum[3]; + } + } + #endif for( ; k < maxk; k++ ) { @@ -1493,8 +1491,8 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d, // initialize space-related bilateral filter coefficients for( i = -radius, maxk = 0; i <= radius; i++ ) - { - j = -radius; + { + j = -radius; for( ;j <= radius; j++ ) { @@ -1504,7 +1502,7 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d, space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); space_ofs[maxk++] = (int)(i*temp.step + j*cn); } - } + } BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight); parallel_for_(Range(0, size.height), body); @@ -1527,12 +1525,12 @@ public: { int i, j, k; Size size = dest->size(); - #if CV_SSE3 + #if CV_SSE3 int CV_DECL_ALIGNED(16) idxBuf[4]; float CV_DECL_ALIGNED(16) bufSum32[4]; static const int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3); - #endif + #endif for( i = range.start; i < range.end; i++ ) { @@ -1545,42 +1543,42 @@ public: { float sum = 0, wsum = 0; float val0 = sptr[j]; - k = 0; - #if CV_SSE3 - if( haveSSE3 ) - { - const __m128 _val0 = _mm_set1_ps(sptr[j]); - const __m128 _scale_index = _mm_set1_ps(scale_index); - const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); - - for( ; k <= maxk - 4 ; k += 4 ) - { - __m128 _sw = _mm_loadu_ps(space_weight + k); - __m128 _val = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]], + k = 0; + #if CV_SSE3 + if( haveSSE3 ) + { + const __m128 _val0 = _mm_set1_ps(sptr[j]); + const __m128 _scale_index = _mm_set1_ps(scale_index); + const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); + + for( ; k <= maxk - 4 ; k += 4 ) + { + __m128 _sw = _mm_loadu_ps(space_weight + k); + __m128 _val = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]], sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]); - __m128 _alpha = _mm_mul_ps(_mm_andnot_ps( _signMask, _mm_sub_ps(_val,_val0)), _scale_index); + __m128 _alpha = _mm_mul_ps(_mm_andnot_ps( _signMask, _mm_sub_ps(_val,_val0)), _scale_index); - __m128i _idx = _mm_cvtps_epi32(_alpha); - _mm_store_si128((__m128i*)idxBuf, _idx); - _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx)); + __m128i _idx = _mm_cvtps_epi32(_alpha); + _mm_store_si128((__m128i*)idxBuf, _idx); + _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx)); - __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], + __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], expLUT[idxBuf[1]], expLUT[idxBuf[0]]); - __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], + __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]); - - __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut)))); - _val = _mm_mul_ps(_w, _val); + + __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut)))); + _val = _mm_mul_ps(_w, _val); - _sw = _mm_hadd_ps(_w, _val); - _sw = _mm_hadd_ps(_sw, _sw); - _mm_storel_pi((__m64*)bufSum32, _sw); + _sw = _mm_hadd_ps(_w, _val); + _sw = _mm_hadd_ps(_sw, _sw); + _mm_storel_pi((__m64*)bufSum32, _sw); - sum += bufSum32[1]; - wsum += bufSum32[0]; - } - } - #endif + sum += bufSum32[1]; + wsum += bufSum32[0]; + } + } + #endif for( ; k < maxk; k++ ) { @@ -1602,63 +1600,63 @@ public: { float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; - k = 0; - #if CV_SSE3 - if( haveSSE3 ) - { - const __m128 _b0 = _mm_set1_ps(b0); - const __m128 _g0 = _mm_set1_ps(g0); - const __m128 _r0 = _mm_set1_ps(r0); - const __m128 _scale_index = _mm_set1_ps(scale_index); + k = 0; + #if CV_SSE3 + if( haveSSE3 ) + { + const __m128 _b0 = _mm_set1_ps(b0); + const __m128 _g0 = _mm_set1_ps(g0); + const __m128 _r0 = _mm_set1_ps(r0); + const __m128 _scale_index = _mm_set1_ps(scale_index); const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); - - for( ; k <= maxk-4; k += 4 ) - { - __m128 _sw = _mm_loadu_ps(space_weight + k); + + for( ; k <= maxk-4; k += 4 ) + { + __m128 _sw = _mm_loadu_ps(space_weight + k); - const float* sptr_k = sptr + j + space_ofs[k]; - const float* sptr_k1 = sptr + j + space_ofs[k+1]; - const float* sptr_k2 = sptr + j + space_ofs[k+2]; - const float* sptr_k3 = sptr + j + space_ofs[k+3]; + const float* sptr_k = sptr + j + space_ofs[k]; + const float* sptr_k1 = sptr + j + space_ofs[k+1]; + const float* sptr_k2 = sptr + j + space_ofs[k+2]; + const float* sptr_k3 = sptr + j + space_ofs[k+3]; - __m128 _b = _mm_set_ps(sptr_k3[0], sptr_k2[0], sptr_k1[0], sptr_k[0]); - __m128 _g = _mm_set_ps(sptr_k3[1], sptr_k2[1], sptr_k1[1], sptr_k[1]); - __m128 _r = _mm_set_ps(sptr_k3[2], sptr_k2[2], sptr_k1[2], sptr_k[2]); + __m128 _b = _mm_set_ps(sptr_k3[0], sptr_k2[0], sptr_k1[0], sptr_k[0]); + __m128 _g = _mm_set_ps(sptr_k3[1], sptr_k2[1], sptr_k1[1], sptr_k[1]); + __m128 _r = _mm_set_ps(sptr_k3[2], sptr_k2[2], sptr_k1[2], sptr_k[2]); - __m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0)); - __m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0)); - __m128 _rt = _mm_andnot_ps(_signMask,_mm_sub_ps(_r,_r0)); + __m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0)); + __m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0)); + __m128 _rt = _mm_andnot_ps(_signMask,_mm_sub_ps(_r,_r0)); - __m128 _alpha = _mm_mul_ps(_scale_index, _mm_add_ps(_rt,_mm_add_ps(_bt, _gt))); + __m128 _alpha = _mm_mul_ps(_scale_index, _mm_add_ps(_rt,_mm_add_ps(_bt, _gt))); - __m128i _idx = _mm_cvtps_epi32(_alpha); - _mm_store_si128((__m128i*)idxBuf, _idx); - _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx)); + __m128i _idx = _mm_cvtps_epi32(_alpha); + _mm_store_si128((__m128i*)idxBuf, _idx); + _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx)); - __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], expLUT[idxBuf[1]], expLUT[idxBuf[0]]); - __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]); - - __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut)))); + __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], expLUT[idxBuf[1]], expLUT[idxBuf[0]]); + __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]); + + __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut)))); - _b = _mm_mul_ps(_b, _w); - _g = _mm_mul_ps(_g, _w); - _r = _mm_mul_ps(_r, _w); + _b = _mm_mul_ps(_b, _w); + _g = _mm_mul_ps(_g, _w); + _r = _mm_mul_ps(_r, _w); - _w = _mm_hadd_ps(_w, _b); - _g = _mm_hadd_ps(_g, _r); + _w = _mm_hadd_ps(_w, _b); + _g = _mm_hadd_ps(_g, _r); - _w = _mm_hadd_ps(_w, _g); - _mm_store_ps(bufSum32, _w); + _w = _mm_hadd_ps(_w, _g); + _mm_store_ps(bufSum32, _w); - wsum += bufSum32[0]; - sum_b += bufSum32[1]; - sum_g += bufSum32[2]; - sum_r += bufSum32[3]; - } + wsum += bufSum32[0]; + sum_b += bufSum32[1]; + sum_g += bufSum32[2]; + sum_r += bufSum32[3]; + } - } - #endif - + } + #endif + for(; k < maxk; k++ ) { const float* sptr_k = sptr + j + space_ofs[k];