This commit is contained in:
Anatoly Baksheev 2012-08-29 21:25:35 +04:00
parent fba081992d
commit 3456238c16
2 changed files with 144 additions and 146 deletions

View File

@ -86,7 +86,7 @@ TEST_P(BilateralFilter, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-3 : 1.0); EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-3 : 1.0);
} }
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, BilateralFilter, testing::Combine( INSTANTIATE_TEST_CASE_P(GPU_Denoising, BilateralFilter, testing::Combine(
ALL_DEVICES, ALL_DEVICES,
testing::Values(cv::Size(128, 128), cv::Size(113, 113), cv::Size(639, 481)), testing::Values(cv::Size(128, 128), cv::Size(113, 113), cv::Size(639, 481)),
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_32FC1), MatType(CV_32FC3)) testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_32FC1), MatType(CV_32FC3))
@ -134,7 +134,7 @@ TEST_P(NonLocalMeans, Regression)
EXPECT_MAT_NEAR(gray_gold, dgray, 1e-4); EXPECT_MAT_NEAR(gray_gold, dgray, 1e-4);
} }
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, NonLocalMeans, ALL_DEVICES); INSTANTIATE_TEST_CASE_P(GPU_Denoising, NonLocalMeans, ALL_DEVICES);
#endif // HAVE_CUDA #endif // HAVE_CUDA

View File

@ -1285,8 +1285,6 @@ void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize )
Bilateral Filtering Bilateral Filtering
\****************************************************************************************/ \****************************************************************************************/
#undef CV_SSE3
namespace cv namespace cv
{ {
@ -1305,12 +1303,12 @@ public:
{ {
int i, j, cn = dest->channels(), k; int i, j, cn = dest->channels(), k;
Size size = dest->size(); Size size = dest->size();
#if CV_SSE3 #if CV_SSE3
int CV_DECL_ALIGNED(16) buf[4]; int CV_DECL_ALIGNED(16) buf[4];
float CV_DECL_ALIGNED(16) bufSum[4]; float CV_DECL_ALIGNED(16) bufSum[4];
static const int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; static const int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3); bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
#endif #endif
for( i = range.start; i < range.end; i++ ) for( i = range.start; i < range.end; i++ )
{ {
@ -1323,36 +1321,36 @@ public:
{ {
float sum = 0, wsum = 0; float sum = 0, wsum = 0;
int val0 = sptr[j]; int val0 = sptr[j];
k = 0; k = 0;
#if CV_SSE3 #if CV_SSE3
if( haveSSE3 ) if( haveSSE3 )
{ {
__m128 _val0 = _mm_set1_ps(static_cast<float>(val0)); __m128 _val0 = _mm_set1_ps(static_cast<float>(val0));
const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
for( ; k <= maxk - 4; k += 4 ) for( ; k <= maxk - 4; k += 4 )
{ {
__m128 _valF = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]], __m128 _valF = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]); sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
__m128 _val = _mm_andnot_ps(_signMask, _mm_sub_ps(_valF, _val0)); __m128 _val = _mm_andnot_ps(_signMask, _mm_sub_ps(_valF, _val0));
_mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_val)); _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_val));
__m128 _cw = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]], __m128 _cw = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
color_weight[buf[1]],color_weight[buf[0]]); color_weight[buf[1]],color_weight[buf[0]]);
__m128 _sw = _mm_loadu_ps(space_weight+k); __m128 _sw = _mm_loadu_ps(space_weight+k);
__m128 _w = _mm_mul_ps(_cw, _sw); __m128 _w = _mm_mul_ps(_cw, _sw);
_cw = _mm_mul_ps(_w, _valF); _cw = _mm_mul_ps(_w, _valF);
_sw = _mm_hadd_ps(_w, _cw); _sw = _mm_hadd_ps(_w, _cw);
_sw = _mm_hadd_ps(_sw, _sw); _sw = _mm_hadd_ps(_sw, _sw);
_mm_storel_pi((__m64*)bufSum, _sw); _mm_storel_pi((__m64*)bufSum, _sw);
sum += bufSum[1]; sum += bufSum[1];
wsum += bufSum[0]; wsum += bufSum[0];
} }
} }
#endif #endif
for( ; k < maxk; k++ ) for( ; k < maxk; k++ )
{ {
int val = sptr[j + space_ofs[k]]; int val = sptr[j + space_ofs[k]];
@ -1371,55 +1369,55 @@ public:
{ {
float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
k = 0; k = 0;
#if CV_SSE3 #if CV_SSE3
if( haveSSE3 ) if( haveSSE3 )
{ {
const __m128 _b0 = _mm_set1_ps(static_cast<float>(b0)); const __m128 _b0 = _mm_set1_ps(static_cast<float>(b0));
const __m128 _g0 = _mm_set1_ps(static_cast<float>(g0)); const __m128 _g0 = _mm_set1_ps(static_cast<float>(g0));
const __m128 _r0 = _mm_set1_ps(static_cast<float>(r0)); const __m128 _r0 = _mm_set1_ps(static_cast<float>(r0));
const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
for( ; k <= maxk - 4; k += 4 ) for( ; k <= maxk - 4; k += 4 )
{ {
const uchar* sptr_k = sptr + j + space_ofs[k]; const uchar* sptr_k = sptr + j + space_ofs[k];
const uchar* sptr_k1 = sptr + j + space_ofs[k+1]; const uchar* sptr_k1 = sptr + j + space_ofs[k+1];
const uchar* sptr_k2 = sptr + j + space_ofs[k+2]; const uchar* sptr_k2 = sptr + j + space_ofs[k+2];
const uchar* sptr_k3 = sptr + j + space_ofs[k+3]; const uchar* sptr_k3 = sptr + j + space_ofs[k+3];
__m128 _b = _mm_set_ps(sptr_k3[0],sptr_k2[0],sptr_k1[0],sptr_k[0]); __m128 _b = _mm_set_ps(sptr_k3[0],sptr_k2[0],sptr_k1[0],sptr_k[0]);
__m128 _g = _mm_set_ps(sptr_k3[1],sptr_k2[1],sptr_k1[1],sptr_k[1]); __m128 _g = _mm_set_ps(sptr_k3[1],sptr_k2[1],sptr_k1[1],sptr_k[1]);
__m128 _r = _mm_set_ps(sptr_k3[2],sptr_k2[2],sptr_k1[2],sptr_k[2]); __m128 _r = _mm_set_ps(sptr_k3[2],sptr_k2[2],sptr_k1[2],sptr_k[2]);
__m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0)); __m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0));
__m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0)); __m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0));
__m128 rt = _mm_andnot_ps(_signMask, _mm_sub_ps(_r,_r0)); __m128 rt = _mm_andnot_ps(_signMask, _mm_sub_ps(_r,_r0));
bt =_mm_add_ps(rt, _mm_add_ps(bt, gt)); bt =_mm_add_ps(rt, _mm_add_ps(bt, gt));
_mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(bt)); _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(bt));
__m128 _w = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]], __m128 _w = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
color_weight[buf[1]],color_weight[buf[0]]); color_weight[buf[1]],color_weight[buf[0]]);
__m128 _sw = _mm_loadu_ps(space_weight+k); __m128 _sw = _mm_loadu_ps(space_weight+k);
_w = _mm_mul_ps(_w,_sw); _w = _mm_mul_ps(_w,_sw);
_b = _mm_mul_ps(_b, _w); _b = _mm_mul_ps(_b, _w);
_g = _mm_mul_ps(_g, _w); _g = _mm_mul_ps(_g, _w);
_r = _mm_mul_ps(_r, _w); _r = _mm_mul_ps(_r, _w);
_w = _mm_hadd_ps(_w, _b); _w = _mm_hadd_ps(_w, _b);
_g = _mm_hadd_ps(_g, _r); _g = _mm_hadd_ps(_g, _r);
_w = _mm_hadd_ps(_w, _g); _w = _mm_hadd_ps(_w, _g);
_mm_store_ps(bufSum, _w); _mm_store_ps(bufSum, _w);
wsum += bufSum[0]; wsum += bufSum[0];
sum_b += bufSum[1]; sum_b += bufSum[1];
sum_g += bufSum[2]; sum_g += bufSum[2];
sum_r += bufSum[3]; sum_r += bufSum[3];
} }
} }
#endif #endif
for( ; k < maxk; k++ ) for( ; k < maxk; k++ )
{ {
@ -1493,8 +1491,8 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d,
// initialize space-related bilateral filter coefficients // initialize space-related bilateral filter coefficients
for( i = -radius, maxk = 0; i <= radius; i++ ) for( i = -radius, maxk = 0; i <= radius; i++ )
{ {
j = -radius; j = -radius;
for( ;j <= radius; j++ ) for( ;j <= radius; j++ )
{ {
@ -1504,7 +1502,7 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d,
space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
space_ofs[maxk++] = (int)(i*temp.step + j*cn); space_ofs[maxk++] = (int)(i*temp.step + j*cn);
} }
} }
BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight); BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
parallel_for_(Range(0, size.height), body); parallel_for_(Range(0, size.height), body);
@ -1527,12 +1525,12 @@ public:
{ {
int i, j, k; int i, j, k;
Size size = dest->size(); Size size = dest->size();
#if CV_SSE3 #if CV_SSE3
int CV_DECL_ALIGNED(16) idxBuf[4]; int CV_DECL_ALIGNED(16) idxBuf[4];
float CV_DECL_ALIGNED(16) bufSum32[4]; float CV_DECL_ALIGNED(16) bufSum32[4];
static const int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; static const int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3); bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
#endif #endif
for( i = range.start; i < range.end; i++ ) for( i = range.start; i < range.end; i++ )
{ {
@ -1545,42 +1543,42 @@ public:
{ {
float sum = 0, wsum = 0; float sum = 0, wsum = 0;
float val0 = sptr[j]; float val0 = sptr[j];
k = 0; k = 0;
#if CV_SSE3 #if CV_SSE3
if( haveSSE3 ) if( haveSSE3 )
{ {
const __m128 _val0 = _mm_set1_ps(sptr[j]); const __m128 _val0 = _mm_set1_ps(sptr[j]);
const __m128 _scale_index = _mm_set1_ps(scale_index); const __m128 _scale_index = _mm_set1_ps(scale_index);
const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
for( ; k <= maxk - 4 ; k += 4 ) for( ; k <= maxk - 4 ; k += 4 )
{ {
__m128 _sw = _mm_loadu_ps(space_weight + k); __m128 _sw = _mm_loadu_ps(space_weight + k);
__m128 _val = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]], __m128 _val = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]); sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
__m128 _alpha = _mm_mul_ps(_mm_andnot_ps( _signMask, _mm_sub_ps(_val,_val0)), _scale_index); __m128 _alpha = _mm_mul_ps(_mm_andnot_ps( _signMask, _mm_sub_ps(_val,_val0)), _scale_index);
__m128i _idx = _mm_cvtps_epi32(_alpha); __m128i _idx = _mm_cvtps_epi32(_alpha);
_mm_store_si128((__m128i*)idxBuf, _idx); _mm_store_si128((__m128i*)idxBuf, _idx);
_alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx)); _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));
__m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]],
expLUT[idxBuf[1]], expLUT[idxBuf[0]]); expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
__m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1],
expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]); expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);
__m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut)))); __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
_val = _mm_mul_ps(_w, _val); _val = _mm_mul_ps(_w, _val);
_sw = _mm_hadd_ps(_w, _val); _sw = _mm_hadd_ps(_w, _val);
_sw = _mm_hadd_ps(_sw, _sw); _sw = _mm_hadd_ps(_sw, _sw);
_mm_storel_pi((__m64*)bufSum32, _sw); _mm_storel_pi((__m64*)bufSum32, _sw);
sum += bufSum32[1]; sum += bufSum32[1];
wsum += bufSum32[0]; wsum += bufSum32[0];
} }
} }
#endif #endif
for( ; k < maxk; k++ ) for( ; k < maxk; k++ )
{ {
@ -1602,62 +1600,62 @@ public:
{ {
float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
k = 0; k = 0;
#if CV_SSE3 #if CV_SSE3
if( haveSSE3 ) if( haveSSE3 )
{ {
const __m128 _b0 = _mm_set1_ps(b0); const __m128 _b0 = _mm_set1_ps(b0);
const __m128 _g0 = _mm_set1_ps(g0); const __m128 _g0 = _mm_set1_ps(g0);
const __m128 _r0 = _mm_set1_ps(r0); const __m128 _r0 = _mm_set1_ps(r0);
const __m128 _scale_index = _mm_set1_ps(scale_index); const __m128 _scale_index = _mm_set1_ps(scale_index);
const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
for( ; k <= maxk-4; k += 4 ) for( ; k <= maxk-4; k += 4 )
{ {
__m128 _sw = _mm_loadu_ps(space_weight + k); __m128 _sw = _mm_loadu_ps(space_weight + k);
const float* sptr_k = sptr + j + space_ofs[k]; const float* sptr_k = sptr + j + space_ofs[k];
const float* sptr_k1 = sptr + j + space_ofs[k+1]; const float* sptr_k1 = sptr + j + space_ofs[k+1];
const float* sptr_k2 = sptr + j + space_ofs[k+2]; const float* sptr_k2 = sptr + j + space_ofs[k+2];
const float* sptr_k3 = sptr + j + space_ofs[k+3]; const float* sptr_k3 = sptr + j + space_ofs[k+3];
__m128 _b = _mm_set_ps(sptr_k3[0], sptr_k2[0], sptr_k1[0], sptr_k[0]); __m128 _b = _mm_set_ps(sptr_k3[0], sptr_k2[0], sptr_k1[0], sptr_k[0]);
__m128 _g = _mm_set_ps(sptr_k3[1], sptr_k2[1], sptr_k1[1], sptr_k[1]); __m128 _g = _mm_set_ps(sptr_k3[1], sptr_k2[1], sptr_k1[1], sptr_k[1]);
__m128 _r = _mm_set_ps(sptr_k3[2], sptr_k2[2], sptr_k1[2], sptr_k[2]); __m128 _r = _mm_set_ps(sptr_k3[2], sptr_k2[2], sptr_k1[2], sptr_k[2]);
__m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0)); __m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0));
__m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0)); __m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0));
__m128 _rt = _mm_andnot_ps(_signMask,_mm_sub_ps(_r,_r0)); __m128 _rt = _mm_andnot_ps(_signMask,_mm_sub_ps(_r,_r0));
__m128 _alpha = _mm_mul_ps(_scale_index, _mm_add_ps(_rt,_mm_add_ps(_bt, _gt))); __m128 _alpha = _mm_mul_ps(_scale_index, _mm_add_ps(_rt,_mm_add_ps(_bt, _gt)));
__m128i _idx = _mm_cvtps_epi32(_alpha); __m128i _idx = _mm_cvtps_epi32(_alpha);
_mm_store_si128((__m128i*)idxBuf, _idx); _mm_store_si128((__m128i*)idxBuf, _idx);
_alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx)); _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));
__m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], expLUT[idxBuf[1]], expLUT[idxBuf[0]]); __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
__m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]); __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);
__m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut)))); __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
_b = _mm_mul_ps(_b, _w); _b = _mm_mul_ps(_b, _w);
_g = _mm_mul_ps(_g, _w); _g = _mm_mul_ps(_g, _w);
_r = _mm_mul_ps(_r, _w); _r = _mm_mul_ps(_r, _w);
_w = _mm_hadd_ps(_w, _b); _w = _mm_hadd_ps(_w, _b);
_g = _mm_hadd_ps(_g, _r); _g = _mm_hadd_ps(_g, _r);
_w = _mm_hadd_ps(_w, _g); _w = _mm_hadd_ps(_w, _g);
_mm_store_ps(bufSum32, _w); _mm_store_ps(bufSum32, _w);
wsum += bufSum32[0]; wsum += bufSum32[0];
sum_b += bufSum32[1]; sum_b += bufSum32[1];
sum_g += bufSum32[2]; sum_g += bufSum32[2];
sum_r += bufSum32[3]; sum_r += bufSum32[3];
} }
} }
#endif #endif
for(; k < maxk; k++ ) for(; k < maxk; k++ )
{ {