#1365 Fixed numerous bugs in Bayer2RGB_VNG_8u SSE optimization, added simple regression test

Andrey Kamaev 2012-03-28 20:20:29 +00:00
parent 32b9bc1d00
commit 09490188b3
2 changed files with 191 additions and 153 deletions


@@ -2234,7 +2234,7 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
 bool greenCell = greenCell0;
 i = 2;
 #if CV_SSE2
 int limit = !haveSSE ? N-2 : greenCell ? std::min(3, N-2) : 2;
 #else
 int limit = N - 2;
@@ -2401,202 +2401,218 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
 dstrow[blueIdx^2] = CV_CAST_8U(R);
 greenCell = !greenCell;
 }
 #if CV_SSE2
 if( !haveSSE )
 break;
-__m128i emask = _mm_set1_epi32(0x0000ffff),
-        omask = _mm_set1_epi32(0xffff0000),
-        all_ones = _mm_set1_epi16(1),
-        z = _mm_setzero_si128();
-__m128 _0_5 = _mm_set1_ps(0.5f);
+__m128i emask = _mm_set1_epi32(0x0000ffff),
+        omask = _mm_set1_epi32(0xffff0000),
+        z = _mm_setzero_si128();
+__m128 _0_5 = _mm_set1_ps(0.5f);
-#define _mm_merge_epi16(a, b) \
-    _mm_or_si128(_mm_and_si128(a, emask), _mm_and_si128(b, omask))
-#define _mm_cvtloepi16_ps(a) _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(a,a), 16))
-#define _mm_cvthiepi16_ps(a) _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(a,a), 16))
+#define _mm_merge_epi16(a, b) _mm_or_si128(_mm_and_si128(a, emask), _mm_and_si128(b, omask)) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA)
+#define _mm_cvtloepi16_ps(a) _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(a,a), 16)) //(1,2,3,4,5,6,7,8) => (1f,2f,3f,4f)
+#define _mm_cvthiepi16_ps(a) _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(a,a), 16)) //(1,2,3,4,5,6,7,8) => (5f,6f,7f,8f)
+#define _mm_loadl_u8_s16(ptr, offset) _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)((ptr) + (offset))), z) //load 8 uchars to 8 shorts
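Aside: the `_mm_merge_epi16` helper above keeps the even-indexed 16-bit lanes of `a` and the odd-indexed lanes of `b`, which is how the code handles the two pixel parities in one register. A minimal standalone sketch of its behavior, assuming an SSE2-capable compiler (the `main` harness and sample values are illustrative, not part of the commit):

#include <emmintrin.h>
#include <cstdio>

int main()
{
    const __m128i emask = _mm_set1_epi32(0x0000ffff); // selects even 16-bit lanes
    const __m128i omask = _mm_set1_epi32(0xffff0000); // selects odd 16-bit lanes

    __m128i a = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
    __m128i b = _mm_setr_epi16(20, 21, 22, 23, 24, 25, 26, 27);

    // _mm_merge_epi16(a, b): even lanes from a, odd lanes from b
    __m128i m = _mm_or_si128(_mm_and_si128(a, emask), _mm_and_si128(b, omask));

    short out[8];
    _mm_storeu_si128((__m128i*)out, m);
    for( int i = 0; i < 8; i++ )
        printf("%d ", out[i]); // expected: 10 21 12 23 14 25 16 27
    printf("\n");
    return 0;
}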
 // process 8 pixels at once
 for( ; i <= N - 10; i += 8, srow += 8, brow0 += 8, brow1 += 8, brow2 += 8 )
 {
-__m128i gradN, gradS, gradW, gradE, gradNE, gradSW, gradNW, gradSE;
-gradN = _mm_adds_epu16(_mm_loadu_si128((__m128i*)brow0),
-                       _mm_loadu_si128((__m128i*)brow1));
-gradS = _mm_adds_epu16(_mm_loadu_si128((__m128i*)brow1),
-                       _mm_loadu_si128((__m128i*)brow2));
-gradW = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N-1)),
-                       _mm_loadu_si128((__m128i*)(brow1+N)));
-gradE = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N+1)),
-                       _mm_loadu_si128((__m128i*)(brow1+N)));
+//int gradN = brow0[0] + brow1[0];
+__m128i gradN = _mm_adds_epi16(_mm_loadu_si128((__m128i*)brow0), _mm_loadu_si128((__m128i*)brow1));
+//int gradS = brow1[0] + brow2[0];
+__m128i gradS = _mm_adds_epi16(_mm_loadu_si128((__m128i*)brow1), _mm_loadu_si128((__m128i*)brow2));
+//int gradW = brow1[N-1] + brow1[N];
+__m128i gradW = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N-1)), _mm_loadu_si128((__m128i*)(brow1+N)));
+//int gradE = brow1[N+1] + brow1[N];
+__m128i gradE = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N+1)), _mm_loadu_si128((__m128i*)(brow1+N)));
-__m128i minGrad, maxGrad, T;
-minGrad = _mm_min_epi16(_mm_min_epi16(_mm_min_epi16(gradN, gradS), gradW), gradE);
-maxGrad = _mm_max_epi16(_mm_max_epi16(_mm_max_epi16(gradN, gradS), gradW), gradE);
+//int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE);
+//int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE);
+__m128i minGrad = _mm_min_epi16(_mm_min_epi16(gradN, gradS), _mm_min_epi16(gradW, gradE));
+__m128i maxGrad = _mm_max_epi16(_mm_max_epi16(gradN, gradS), _mm_max_epi16(gradW, gradE));
 __m128i grad0, grad1;
-grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N4+1)),
-                       _mm_loadu_si128((__m128i*)(brow1+N4)));
-grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N2)),
-                                      _mm_loadu_si128((__m128i*)(brow0+N2+1))),
-                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N2)),
-                                      _mm_loadu_si128((__m128i*)(brow1+N2+1))));
-gradNE = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);
+//int gradNE = brow0[N4+1] + brow1[N4];
+//int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1];
+grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N4+1)), _mm_loadu_si128((__m128i*)(brow1+N4)));
+grad1 = _mm_adds_epi16( _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N2)), _mm_loadu_si128((__m128i*)(brow0+N2+1))),
+                        _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N2)), _mm_loadu_si128((__m128i*)(brow1+N2+1))));
+__m128i gradNE = _mm_merge_epi16(grad0, grad1);
-grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N4-1)),
-                       _mm_loadu_si128((__m128i*)(brow1+N4)));
-grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N2)),
-                                      _mm_loadu_si128((__m128i*)(brow2+N2-1))),
-                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N2)),
-                                      _mm_loadu_si128((__m128i*)(brow1+N2-1))));
-gradSW = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);
+//int gradSW = brow1[N4] + brow2[N4-1];
+//int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1];
+grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N4-1)), _mm_loadu_si128((__m128i*)(brow1+N4)));
+grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N2)), _mm_loadu_si128((__m128i*)(brow2+N2-1))),
+                       _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N2)), _mm_loadu_si128((__m128i*)(brow1+N2-1))));
+__m128i gradSW = _mm_merge_epi16(grad0, grad1);
 minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNE), gradSW);
 maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNE), gradSW);
-grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N5-1)),
-                       _mm_loadu_si128((__m128i*)(brow1+N5)));
-grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N3)),
-                                      _mm_loadu_si128((__m128i*)(brow0+N3-1))),
-                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N3)),
-                                      _mm_loadu_si128((__m128i*)(brow1+N3-1))));
-gradNW = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);
+//int gradNW = brow0[N5-1] + brow1[N5];
+//int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1];
+grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N5-1)), _mm_loadu_si128((__m128i*)(brow1+N5)));
+grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N3)), _mm_loadu_si128((__m128i*)(brow0+N3-1))),
+                       _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N3)), _mm_loadu_si128((__m128i*)(brow1+N3-1))));
+__m128i gradNW = _mm_merge_epi16(grad0, grad1);
-grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N5+1)),
-                       _mm_loadu_si128((__m128i*)(brow1+N5)));
-grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N3)),
-                                      _mm_loadu_si128((__m128i*)(brow2+N3+1))),
-                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N3)),
-                                      _mm_loadu_si128((__m128i*)(brow1+N3+1))));
-gradSE = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);
+//int gradSE = brow1[N5] + brow2[N5+1];
+//int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1];
+grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N5+1)), _mm_loadu_si128((__m128i*)(brow1+N5)));
+grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N3)), _mm_loadu_si128((__m128i*)(brow2+N3+1))),
+                       _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N3)), _mm_loadu_si128((__m128i*)(brow1+N3+1))));
+__m128i gradSE = _mm_merge_epi16(grad0, grad1);
 minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNW), gradSE);
 maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNW), gradSE);
-T = _mm_add_epi16(_mm_srli_epi16(maxGrad, 1), minGrad);
-__m128i RGs = z, GRs = z, Bs = z, ng = z, mask;
+//int T = minGrad + maxGrad/2;
+__m128i T = _mm_adds_epi16(_mm_srli_epi16(maxGrad, 1), minGrad);
+__m128i RGs = z, GRs = z, Bs = z, ng = z;
-__m128i t0, t1, x0, x1, x2, x3, x4, x5, x6, x7, x8,
-        x9, x10, x11, x12, x13, x14, x15, x16;
-x0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)srow), z);
-x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep-1)), z);
-x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2-1)), z);
-x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep)), z);
-x4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2+1)), z);
-x5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep+1)), z);
-x6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep+2)), z);
-x7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1)), z);
-x8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep+2)), z);
-x9 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep+1)), z);
-x10 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2+1)), z);
-x11 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep)), z);
-x12 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2-1)), z);
-x13 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep-1)), z);
-x14 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep-2)), z);
-x15 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1)), z);
-x16 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep-2)), z);
+__m128i x0  = _mm_loadl_u8_s16(srow, +0          );
+__m128i x1  = _mm_loadl_u8_s16(srow, -1 - bstep  );
+__m128i x2  = _mm_loadl_u8_s16(srow, -1 - bstep*2);
+__m128i x3  = _mm_loadl_u8_s16(srow,    - bstep  );
+__m128i x4  = _mm_loadl_u8_s16(srow, +1 - bstep*2);
+__m128i x5  = _mm_loadl_u8_s16(srow, +1 - bstep  );
+__m128i x6  = _mm_loadl_u8_s16(srow, +2 - bstep  );
+__m128i x7  = _mm_loadl_u8_s16(srow, +1          );
+__m128i x8  = _mm_loadl_u8_s16(srow, +2 + bstep  );
+__m128i x9  = _mm_loadl_u8_s16(srow, +1 + bstep  );
+__m128i x10 = _mm_loadl_u8_s16(srow, +1 + bstep*2);
+__m128i x11 = _mm_loadl_u8_s16(srow,    + bstep  );
+__m128i x12 = _mm_loadl_u8_s16(srow, -1 + bstep*2);
+__m128i x13 = _mm_loadl_u8_s16(srow, -1 + bstep  );
+__m128i x14 = _mm_loadl_u8_s16(srow, -2 + bstep  );
+__m128i x15 = _mm_loadl_u8_s16(srow, -1          );
+__m128i x16 = _mm_loadl_u8_s16(srow, -2 - bstep  );
+__m128i t0, t1, mask;
-// gradN
-mask = _mm_cmpgt_epi16(T, gradN);
-ng = _mm_sub_epi16(ng, mask);
-t0 = _mm_slli_epi16(x3, 1);
-t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2)), z), x0);
-RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));
-GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epu16(x2,x4)), mask));
-Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x1,x5), t0), mask));
+// gradN ***********************************************
+mask = _mm_cmpgt_epi16(T, gradN); // mask = T>gradN
+ng = _mm_sub_epi16(ng, mask);     // ng += (T>gradN)
+t0 = _mm_slli_epi16(x3, 1);                                // srow[-bstep]*2
+t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -bstep*2), x0); // srow[-bstep*2] + srow[0]
+// RGs += (srow[-bstep*2] + srow[0]) * (T>gradN)
+RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
+// GRs += {srow[-bstep]*2; (srow[-bstep*2-1] + srow[-bstep*2+1])} * (T>gradN)
+GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epi16(x2,x4)), mask));
+// Bs += {(srow[-bstep-1]+srow[-bstep+1]); srow[-bstep]*2 } * (T>gradN)
+Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x1,x5), t0), mask));
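Aside: the per-direction pattern above, repeated for all eight compass directions below, is a branchless conditional accumulate. `_mm_cmpgt_epi16` yields an all-ones (-1) or all-zeros mask per lane, `_mm_sub_epi16(ng, mask)` increments the direction count where the test passed, and `_mm_and_si128(value, mask)` zeroes the contribution elsewhere. A per-lane scalar sketch of the idiom (the names are illustrative stand-ins, and saturation is omitted):

// Scalar equivalent of one SSE lane of the mask/accumulate idiom.
inline void accumulate_direction( short T, short grad, short contrib,
                                  short& sum, short& ng )
{
    short mask = (T > grad) ? (short)0xFFFF : 0; // _mm_cmpgt_epi16: -1 or 0
    ng  -= mask;             // _mm_sub_epi16(ng, mask): ng += (T > grad)
    sum += contrib & mask;   // _mm_and_si128 + _mm_adds_epi16 (no saturation here)
}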
-// gradNE
-mask = _mm_cmpgt_epi16(T, gradNE);
-ng = _mm_sub_epi16(ng, mask);
-t0 = _mm_slli_epi16(x5, 1);
-t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2+2)), z), x0);
-RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
-GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6+1)),
-                                                        _mm_adds_epu16(x4,x7)), mask));
-Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epu16(x3,x6)), mask));
+// gradNE **********************************************
+mask = _mm_cmpgt_epi16(T, gradNE); // mask = T>gradNE
+ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradNE)
+t0 = _mm_slli_epi16(x5, 1);                                  // srow[-bstep+1]*2
+t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -bstep*2+2), x0); // srow[-bstep*2+2] + srow[0]
+// RGs += {(srow[-bstep*2+2] + srow[0]); srow[-bstep+1]*2} * (T>gradNE)
+RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
+// GRs += {brow0[N6+1]; (srow[-bstep*2+1] + srow[1])} * (T>gradNE)
+GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6+1)), _mm_adds_epi16(x4,x7)), mask));
+// Bs += {srow[-bstep+1]*2; (srow[-bstep] + srow[-bstep+2])} * (T>gradNE)
+Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epi16(x3,x6)), mask));
-// gradE
-mask = _mm_cmpgt_epi16(T, gradE);
-ng = _mm_sub_epi16(ng, mask);
-t0 = _mm_slli_epi16(x7, 1);
-t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+2)), z), x0);
-RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));
-GRs = _mm_adds_epu16(GRs, _mm_and_si128(t0, mask));
-Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x5,x9),
-                                                      _mm_adds_epu16(x6,x8)), mask));
+// gradE ***********************************************
+mask = _mm_cmpgt_epi16(T, gradE); // mask = T>gradE
+ng = _mm_sub_epi16(ng, mask);     // ng += (T>gradE)
+t0 = _mm_slli_epi16(x7, 1);                         // srow[1]*2
+t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, 2), x0); // srow[2] + srow[0]
+// RGs += (srow[2] + srow[0]) * (T>gradE)
+RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
+// GRs += (srow[1]*2) * (T>gradE)
+GRs = _mm_adds_epi16(GRs, _mm_and_si128(t0, mask));
+// Bs += {(srow[-bstep+1]+srow[bstep+1]); (srow[-bstep+2]+srow[bstep+2])} * (T>gradE)
+Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x5,x9), _mm_adds_epi16(x6,x8)), mask));
-// gradSE
-mask = _mm_cmpgt_epi16(T, gradSE);
-ng = _mm_sub_epi16(ng, mask);
-t0 = _mm_slli_epi16(x9, 1);
-t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2+2)), z), x0);
-RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
-GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6+1)),
-                                                        _mm_adds_epu16(x7,x10)), mask));
-Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epu16(x8,x11)), mask));
+// gradSE **********************************************
+mask = _mm_cmpgt_epi16(T, gradSE); // mask = T>gradSE
+ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradSE)
+t0 = _mm_slli_epi16(x9, 1);                                 // srow[bstep+1]*2
+t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, bstep*2+2), x0); // srow[bstep*2+2] + srow[0]
+// RGs += {(srow[bstep*2+2] + srow[0]); srow[bstep+1]*2} * (T>gradSE)
+RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
+// GRs += {brow2[N6+1]; (srow[1]+srow[bstep*2+1])} * (T>gradSE)
+GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6+1)), _mm_adds_epi16(x7,x10)), mask));
+// Bs += {srow[-bstep+1]*2; (srow[bstep+2]+srow[bstep])} * (T>gradSE)
+Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_slli_epi16(x5, 1), _mm_adds_epi16(x8,x11)), mask));
-// gradS
-mask = _mm_cmpgt_epi16(T, gradS);
-ng = _mm_sub_epi16(ng, mask);
-t0 = _mm_slli_epi16(x11, 1);
-t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2)), z), x0);
-RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));
-GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epu16(x10,x12)), mask));
-Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x9,x13), t0), mask));
+// gradS ***********************************************
+mask = _mm_cmpgt_epi16(T, gradS); // mask = T>gradS
+ng = _mm_sub_epi16(ng, mask);     // ng += (T>gradS)
+t0 = _mm_slli_epi16(x11, 1);                              // srow[bstep]*2
+t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, bstep*2), x0); // srow[bstep*2]+srow[0]
+// RGs += (srow[bstep*2]+srow[0]) * (T>gradS)
+RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
+// GRs += {srow[bstep]*2; (srow[bstep*2+1]+srow[bstep*2-1])} * (T>gradS)
+GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epi16(x10,x12)), mask));
+// Bs += {(srow[bstep+1]+srow[bstep-1]); srow[bstep]*2} * (T>gradS)
+Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x9,x13), t0), mask));
-// gradSW
-mask = _mm_cmpgt_epi16(T, gradSW);
-ng = _mm_sub_epi16(ng, mask);
-t0 = _mm_slli_epi16(x13, 1);
-t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2-2)), z), x0);
-RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
-GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6-1)),
-                                                        _mm_adds_epu16(x12,x15)), mask));
-Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epu16(x11,x14)), mask));
+// gradSW **********************************************
+mask = _mm_cmpgt_epi16(T, gradSW); // mask = T>gradSW
+ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradSW)
+t0 = _mm_slli_epi16(x13, 1);                                // srow[bstep-1]*2
+t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, bstep*2-2), x0); // srow[bstep*2-2]+srow[0]
+// RGs += {(srow[bstep*2-2]+srow[0]); srow[bstep-1]*2} * (T>gradSW)
+RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
+// GRs += {brow2[N6-1]; (srow[bstep*2-1]+srow[-1])} * (T>gradSW)
+GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6-1)), _mm_adds_epi16(x12,x15)), mask));
+// Bs += {srow[bstep-1]*2; (srow[bstep]+srow[bstep-2])} * (T>gradSW)
+Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epi16(x11,x14)), mask));
-// gradW
-mask = _mm_cmpgt_epi16(T, gradW);
-ng = _mm_sub_epi16(ng, mask);
-t0 = _mm_slli_epi16(x15, 1);
-t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-2)), z), x0);
-RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));
-GRs = _mm_adds_epu16(GRs, _mm_and_si128(t0, mask));
-Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x1,x13),
-                                                      _mm_adds_epu16(x14,x16)), mask));
+// gradW ***********************************************
+mask = _mm_cmpgt_epi16(T, gradW); // mask = T>gradW
+ng = _mm_sub_epi16(ng, mask);     // ng += (T>gradW)
+t0 = _mm_slli_epi16(x15, 1);                         // srow[-1]*2
+t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -2), x0); // srow[-2]+srow[0]
+// RGs += (srow[-2]+srow[0]) * (T>gradW)
+RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
+// GRs += (srow[-1]*2) * (T>gradW)
+GRs = _mm_adds_epi16(GRs, _mm_and_si128(t0, mask));
+// Bs += {(srow[-bstep-1]+srow[bstep-1]); (srow[bstep-2]+srow[-bstep-2])} * (T>gradW)
+Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x1,x13), _mm_adds_epi16(x14,x16)), mask));
-// gradNW
-mask = _mm_cmpgt_epi16(T, gradNW);
-ng = _mm_max_epi16(_mm_sub_epi16(ng, mask), all_ones);
-t0 = _mm_slli_epi16(x1, 1);
-t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2-2)), z), x0);
-RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
-GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6-1)),
-                                                        _mm_adds_epu16(x2,x15)), mask));
-Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epu16(x3,x16)), mask));
+// gradNW **********************************************
+mask = _mm_cmpgt_epi16(T, gradNW); // mask = T>gradNW
+ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradNW)
+t0 = _mm_slli_epi16(x1, 1);                                  // srow[-bstep-1]*2
+t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -bstep*2-2), x0); // srow[-bstep*2-2]+srow[0]
+// RGs += {(srow[-bstep*2-2]+srow[0]); srow[-bstep-1]*2} * (T>gradNW)
+RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
+// GRs += {brow0[N6-1]; (srow[-bstep*2-1]+srow[-1])} * (T>gradNW)
+GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6-1)), _mm_adds_epi16(x2,x15)), mask));
+// Bs += {srow[-bstep-1]*2; (srow[-bstep]+srow[-bstep-2])} * (T>gradNW)
+Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_slli_epi16(x5, 1),_mm_adds_epi16(x3,x16)), mask));
 __m128 ngf0, ngf1;
 ngf0 = _mm_div_ps(_0_5, _mm_cvtloepi16_ps(ng));
 ngf1 = _mm_div_ps(_0_5, _mm_cvthiepi16_ps(ng));
 // now interpolate r, g & b
 t0 = _mm_sub_epi16(GRs, RGs);
 t1 = _mm_sub_epi16(Bs, RGs);
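Aside: the `0.5f/ng` weights above are computed in floats because SSE2 has no packed 16-bit division; `_mm_cvtloepi16_ps`/`_mm_cvthiepi16_ps` sign-extend the eight 16-bit direction counts into two quads of 32-bit floats first. A minimal standalone sketch of that widening step (the harness and sample counts are illustrative, not from the commit):

#include <emmintrin.h>
#include <cstdio>

int main()
{
    // Eight 16-bit direction counts, as in 'ng' above.
    __m128i ng = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
    __m128 _0_5 = _mm_set1_ps(0.5f);

    // Duplicate each lane, then arithmetic-shift right by 16 so every 16-bit
    // value lands sign-extended in a 32-bit lane before the int->float cast.
    __m128 lo = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(ng, ng), 16));
    __m128 hi = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(ng, ng), 16));

    float w[8];
    _mm_storeu_ps(w,     _mm_div_ps(_0_5, lo));
    _mm_storeu_ps(w + 4, _mm_div_ps(_0_5, hi));
    for( int i = 0; i < 8; i++ )
        printf("%g ", w[i]); // expected: 0.5 0.25 0.166667 0.125 0.1 ...
    printf("\n");
    return 0;
}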


@@ -1658,6 +1658,7 @@ void CV_ColorBayerTest::prepare_to_validation( int /*test_case_idx*/ )
 CV_Error(CV_StsUnsupportedFormat, "");
 }
 /////////////////////////////////////////////////////////////////////////////////////////////////
 TEST(Imgproc_ColorGray, accuracy) { CV_ColorGrayTest test; test.safe_run(); }
@@ -1669,3 +1670,24 @@ TEST(Imgproc_ColorLab, accuracy) { CV_ColorLabTest test; test.safe_run(); }
 TEST(Imgproc_ColorLuv, accuracy) { CV_ColorLuvTest test; test.safe_run(); }
 TEST(Imgproc_ColorRGB, accuracy) { CV_ColorRGBTest test; test.safe_run(); }
 TEST(Imgproc_ColorBayer, accuracy) { CV_ColorBayerTest test; test.safe_run(); }
+TEST(Imgproc_ColorBayerVNG, accuracy)
+{
+    cvtest::TS& ts = *cvtest::TS::ptr();
+    Mat given = imread(ts.get_data_path() + "/cvtcolor/bayerVNG_input.png", CV_LOAD_IMAGE_GRAYSCALE);
+    Mat gold = imread(ts.get_data_path() + "/cvtcolor/bayerVNG_gold.png", CV_LOAD_IMAGE_UNCHANGED);
+    Mat result;
+
+    cvtColor(given, result, CV_BayerBG2BGR_VNG, 3);
+
+    EXPECT_EQ(gold.type(), result.type());
+    EXPECT_EQ(gold.cols, result.cols);
+    EXPECT_EQ(gold.rows, result.rows);
+
+    Mat diff;
+    absdiff(gold, result, diff);
+
+    EXPECT_EQ(0, countNonZero(diff.reshape(1) > 1));
+}
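The final assertion tolerates an off-by-one per channel: `absdiff` takes the per-channel absolute difference, `reshape(1)` unfolds the three channels into separate single-channel columns, `> 1` builds a 0/255 mask of channels differing by more than one level, and `countNonZero` must find none. A minimal standalone sketch of that pattern (the matrices are illustrative, not the test data):

#include <opencv2/core/core.hpp>
#include <cstdio>

int main()
{
    // Two small 3-channel images differing by at most 1 in one channel.
    cv::Mat gold(2, 2, CV_8UC3, cv::Scalar(10, 20, 30));
    cv::Mat result = gold.clone();
    result.at<cv::Vec3b>(0, 0)[0] += 1; // within the tolerance

    cv::Mat diff;
    cv::absdiff(gold, result, diff);

    // reshape(1) views the data as single-channel (3x the columns),
    // "> 1" yields a 0/255 mask, countNonZero counts offending channels.
    int bad = cv::countNonZero(diff.reshape(1) > 1);
    printf("channels off by more than 1: %d\n", bad); // expected: 0
    return 0;
}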