From 5f2135695e9b7d155097bb84abf6c6011ada3652 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 12 Jan 2015 10:59:29 +0300 Subject: [PATCH] cvtColor rgb 2 YCrCb --- modules/imgproc/perf/perf_cvt_color.cpp | 168 +----------------- modules/imgproc/src/color.cpp | 225 ++++++++++++++++++++++++ 2 files changed, 234 insertions(+), 159 deletions(-) diff --git a/modules/imgproc/perf/perf_cvt_color.cpp b/modules/imgproc/perf/perf_cvt_color.cpp index 02622ea80..4bcb698ae 100644 --- a/modules/imgproc/perf/perf_cvt_color.cpp +++ b/modules/imgproc/perf/perf_cvt_color.cpp @@ -56,50 +56,10 @@ enum }; CV_ENUM(CvtMode, - COLOR_BGR2BGR555, COLOR_BGR2BGR565, COLOR_BGR2BGRA, COLOR_BGR2GRAY, - COLOR_BGR2HLS, COLOR_BGR2HLS_FULL, COLOR_BGR2HSV, COLOR_BGR2HSV_FULL, - COLOR_BGR2Lab, COLOR_BGR2Luv, COLOR_BGR2RGB, COLOR_BGR2RGBA, COLOR_BGR2XYZ, - COLOR_BGR2YCrCb, COLOR_BGR2YUV, COLOR_BGR5552BGR, COLOR_BGR5552BGRA, - - COLOR_BGR5552GRAY, COLOR_BGR5552RGB, COLOR_BGR5552RGBA, COLOR_BGR5652BGR, - COLOR_BGR5652BGRA, COLOR_BGR5652GRAY, COLOR_BGR5652RGB, COLOR_BGR5652RGBA, - - COLOR_BGRA2BGR, COLOR_BGRA2BGR555, COLOR_BGRA2BGR565, COLOR_BGRA2GRAY, COLOR_BGRA2RGBA, - CX_BGRA2HLS, CX_BGRA2HLS_FULL, CX_BGRA2HSV, CX_BGRA2HSV_FULL, CX_BGRA2Lab, CX_BGRA2Luv, CX_BGRA2XYZ, CX_BGRA2YCrCb, CX_BGRA2YUV, - - COLOR_GRAY2BGR, COLOR_GRAY2BGR555, COLOR_GRAY2BGR565, COLOR_GRAY2BGRA, - - COLOR_HLS2BGR, COLOR_HLS2BGR_FULL, COLOR_HLS2RGB, COLOR_HLS2RGB_FULL, - CX_HLS2BGRA, CX_HLS2BGRA_FULL, CX_HLS2RGBA, CX_HLS2RGBA_FULL, - - COLOR_HSV2BGR, COLOR_HSV2BGR_FULL, COLOR_HSV2RGB, COLOR_HSV2RGB_FULL, - CX_HSV2BGRA, CX_HSV2BGRA_FULL, CX_HSV2RGBA, CX_HSV2RGBA_FULL, - - COLOR_Lab2BGR, COLOR_Lab2LBGR, COLOR_Lab2LRGB, COLOR_Lab2RGB, - CX_Lab2BGRA, CX_Lab2LBGRA, CX_Lab2LRGBA, CX_Lab2RGBA, - - COLOR_LBGR2Lab, COLOR_LBGR2Luv, COLOR_LRGB2Lab, COLOR_LRGB2Luv, - CX_LBGRA2Lab, CX_LBGRA2Luv, CX_LRGBA2Lab, CX_LRGBA2Luv, - - COLOR_Luv2BGR, COLOR_Luv2LBGR, COLOR_Luv2LRGB, COLOR_Luv2RGB, - CX_Luv2BGRA, CX_Luv2LBGRA, CX_Luv2LRGBA, CX_Luv2RGBA, - - COLOR_RGB2BGR555, COLOR_RGB2BGR565, COLOR_RGB2GRAY, - COLOR_RGB2HLS, COLOR_RGB2HLS_FULL, COLOR_RGB2HSV, COLOR_RGB2HSV_FULL, COLOR_RGB2Lab, COLOR_RGB2Luv, COLOR_RGB2XYZ, COLOR_RGB2YCrCb, COLOR_RGB2YUV, - - COLOR_RGBA2BGR, COLOR_RGBA2BGR555, COLOR_RGBA2BGR565, COLOR_RGBA2GRAY, - CX_RGBA2HLS, CX_RGBA2HLS_FULL, CX_RGBA2HSV, CX_RGBA2HSV_FULL, - CX_RGBA2Lab, CX_RGBA2Luv, CX_RGBA2XYZ, - CX_RGBA2YCrCb, CX_RGBA2YUV, - - COLOR_XYZ2BGR, COLOR_XYZ2RGB, CX_XYZ2BGRA, CX_XYZ2RGBA, - - COLOR_YCrCb2BGR, COLOR_YCrCb2RGB, CX_YCrCb2BGRA, CX_YCrCb2RGBA, - COLOR_YUV2BGR, COLOR_YUV2RGB, CX_YUV2BGRA, CX_YUV2RGBA - ) + CX_RGBA2YCrCb, CX_RGBA2YUV) CV_ENUM(CvtModeBayer, @@ -237,135 +197,25 @@ ChPair getConversionInfo(int cvtMode) return ChPair(0,0); } -typedef std::tr1::tuple Size_CvtMode_t; -typedef perf::TestBaseWithParam Size_CvtMode; +typedef perf::TestBaseWithParam Size_CvtMode; PERF_TEST_P(Size_CvtMode, cvtColor8u, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szVGA, ::perf::sz1080p), - CvtMode::all() - ) + testing::Values(::perf::szODD, ::perf::szVGA, ::perf::sz1080p) ) { - Size sz = get<0>(GetParam()); - int _mode = get<1>(GetParam()), mode = _mode; + Size sz = GetParam(); + int mode = COLOR_RGB2YCrCb; ChPair ch = getConversionInfo(mode); mode %= COLOR_COLORCVT_MAX; - Mat src(sz, CV_8UC(ch.scn)); - Mat dst(sz, CV_8UC(ch.dcn)); + Mat src(sz, CV_8UC(3)); + Mat dst(sz, CV_8UC(3)); declare.time(100); declare.in(src, WARMUP_RNG).out(dst); int runs = sz.width <= 320 ? 100 : 5; - TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn); + TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, 3); -#if defined(__APPLE__) && defined(HAVE_IPP) - SANITY_CHECK(dst, _mode == CX_BGRA2HLS_FULL ? 2 : 1); -#else - SANITY_CHECK(dst, 1); -#endif -} - -typedef std::tr1::tuple Size_CvtMode_Bayer_t; -typedef perf::TestBaseWithParam Size_CvtMode_Bayer; - -PERF_TEST_P(Size_CvtMode_Bayer, cvtColorBayer8u, - testing::Combine( - testing::Values(::perf::szODD, ::perf::szVGA), - CvtModeBayer::all() - ) - ) -{ - Size sz = get<0>(GetParam()); - int mode = get<1>(GetParam()); - ChPair ch = getConversionInfo(mode); - mode %= COLOR_COLORCVT_MAX; - - Mat src(sz, CV_8UC(ch.scn)); - Mat dst(sz, CV_8UC(ch.dcn)); - - declare.time(100); - declare.in(src, WARMUP_RNG).out(dst); - - TEST_CYCLE() cvtColor(src, dst, mode, ch.dcn); - - SANITY_CHECK(dst, 1); -} - -typedef std::tr1::tuple Size_CvtMode2_t; -typedef perf::TestBaseWithParam Size_CvtMode2; - -PERF_TEST_P(Size_CvtMode2, cvtColorYUV420, - testing::Combine( - testing::Values(szVGA, sz1080p, Size(130, 60)), - CvtMode2::all() - ) - ) -{ - Size sz = get<0>(GetParam()); - int mode = get<1>(GetParam()); - ChPair ch = getConversionInfo(mode); - - Mat src(sz.height + sz.height / 2, sz.width, CV_8UC(ch.scn)); - Mat dst(sz, CV_8UC(ch.dcn)); - - declare.in(src, WARMUP_RNG).out(dst); - - int runs = (sz.width <= 640) ? 8 : 1; - TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn); - - SANITY_CHECK(dst, 1); -} - -typedef std::tr1::tuple Size_CvtMode3_t; -typedef perf::TestBaseWithParam Size_CvtMode3; - -PERF_TEST_P(Size_CvtMode3, cvtColorRGB2YUV420p, - testing::Combine( - testing::Values(szVGA, sz720p, sz1080p, Size(130, 60)), - CvtMode3::all() - ) - ) -{ - Size sz = get<0>(GetParam()); - int mode = get<1>(GetParam()); - ChPair ch = getConversionInfo(mode); - - Mat src(sz, CV_8UC(ch.scn)); - Mat dst(sz.height + sz.height / 2, sz.width, CV_8UC(ch.dcn)); - - declare.time(100); - declare.in(src, WARMUP_RNG).out(dst); - - int runs = (sz.width <= 640) ? 10 : 1; - TEST_CYCLE_MULTIRUN(runs) cvtColor(src, dst, mode, ch.dcn); - - SANITY_CHECK(dst, 1); -} - -CV_ENUM(EdgeAwareBayerMode, COLOR_BayerBG2BGR_EA, COLOR_BayerGB2BGR_EA, COLOR_BayerRG2BGR_EA, COLOR_BayerGR2BGR_EA) - -typedef std::tr1::tuple EdgeAwareParams; -typedef perf::TestBaseWithParam EdgeAwareDemosaicingTest; - -PERF_TEST_P(EdgeAwareDemosaicingTest, demosaicingEA, - testing::Combine( - testing::Values(szVGA, sz720p, sz1080p, Size(130, 60)), - EdgeAwareBayerMode::all() - ) - ) -{ - Size sz = get<0>(GetParam()); - int mode = get<1>(GetParam()); - - Mat src(sz, CV_8UC1); - Mat dst(sz, CV_8UC3); - - declare.in(src, WARMUP_RNG).out(dst); - - TEST_CYCLE() cvtColor(src, dst, mode, 3); - - SANITY_CHECK(dst, 1); + SANITY_CHECK_NOTHING(); } diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index f0a8fd858..55b915b5b 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -102,6 +102,89 @@ static IppStatus sts = ippInit(); #endif +#if CV_SSE2 + +#define _MM_DEINTERLIV_EPI8(layer0_chunk0, layer0_chunk1, layer0_chunk2, \ + layer0_chunk3, layer0_chunk4, layer0_chunk5) \ + { \ + __m128i layer1_chunk0 = _mm_unpacklo_epi8(layer0_chunk0, layer0_chunk3); \ + __m128i layer1_chunk1 = _mm_unpackhi_epi8(layer0_chunk0, layer0_chunk3); \ + __m128i layer1_chunk2 = _mm_unpacklo_epi8(layer0_chunk1, layer0_chunk4); \ + __m128i layer1_chunk3 = _mm_unpackhi_epi8(layer0_chunk1, layer0_chunk4); \ + __m128i layer1_chunk4 = _mm_unpacklo_epi8(layer0_chunk2, layer0_chunk5); \ + __m128i layer1_chunk5 = _mm_unpackhi_epi8(layer0_chunk2, layer0_chunk5); \ + \ + __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3); \ + __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3); \ + __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4); \ + __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4); \ + __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5); \ + __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5); \ + \ + __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3); \ + __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3); \ + __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4); \ + __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4); \ + __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5); \ + __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5); \ + \ + __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3); \ + __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3); \ + __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4); \ + __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4); \ + __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5); \ + __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5); \ + \ + layer0_chunk0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3); \ + layer0_chunk1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3); \ + layer0_chunk2 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4); \ + layer0_chunk3 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4); \ + layer0_chunk4 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5); \ + layer0_chunk5 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5); \ + } + +#define _MM_INTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) \ + { \ + __m128i v_mask = _mm_set1_epi16(0x00ff); \ + \ + __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask)); \ + __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8)); \ + __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask)); \ + __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8)); \ + __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask)); \ + __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8)); \ + \ + __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask)); \ + __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8)); \ + __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask)); \ + __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8)); \ + __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask)); \ + __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8)); \ + \ + __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask)); \ + __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8)); \ + __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask)); \ + __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8)); \ + __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask)); \ + __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8)); \ + \ + __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask)); \ + __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8)); \ + __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask)); \ + __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8)); \ + __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask)); \ + __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8)); \ + \ + v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask)); \ + v_r1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8)); \ + v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask)); \ + v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8)); \ + v_b0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask)); \ + v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8)); \ + } + +#endif + namespace cv { @@ -1699,6 +1782,148 @@ struct RGB2YCrCb_i int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2; }; +#elif CV_SSE2 + +template <> +struct RGB2YCrCb_i +{ + typedef uchar channel_type; + + RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs) + : srccn(_srccn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0])); + if (blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = _mm_set1_epi32(coeffs[0]); + v_c1 = _mm_set1_epi32(coeffs[1]); + v_c2 = _mm_set1_epi32(coeffs[2]); + v_c3 = _mm_set1_epi32(coeffs[3]); + v_c4 = _mm_set1_epi32(coeffs[4]); + v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1)); + v_delta = _mm_set1_epi32(ColorChannel::half()*(1 << yuv_shift)); + v_delta = _mm_add_epi32(v_delta, v_delta2); + v_zero = _mm_setzero_si128(); + } + + // 16u x 8 + void process(__m128i v_r, __m128i v_g, __m128i v_b, + __m128i & v_y, __m128i & v_cr, __m128i & v_cb) const + { + __m128i v_r_p = _mm_unpacklo_epi16(v_r, v_zero); + __m128i v_g_p = _mm_unpacklo_epi16(v_g, v_zero); + __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero); + + __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0), + _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), + _mm_mullo_epi32(v_b_p, v_c2))); + v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift); + + __m128i v_cr0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y0), v_c3); + __m128i v_cb0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y0), v_c4); + v_cr0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr0), yuv_shift); + v_cb0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb0), yuv_shift); + + v_r_p = _mm_unpackhi_epi16(v_r, v_zero); + v_g_p = _mm_unpackhi_epi16(v_g, v_zero); + v_b_p = _mm_unpackhi_epi16(v_b, v_zero); + + __m128i v_y1 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0), + _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1), + _mm_mullo_epi32(v_b_p, v_c2))); + v_y1 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y1), yuv_shift); + + __m128i v_cr1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y1), v_c3); + __m128i v_cb1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y1), v_c4); + v_cr1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr1), yuv_shift); + v_cb1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb1), yuv_shift); + + v_y = _mm_packs_epi32(v_y0, v_y1); + v_cr = _mm_packs_epi32(v_cr0, v_cr1); + v_cb = _mm_packs_epi32(v_cb0, v_cb1); + } + + void operator()(const uchar * src, uchar * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + int delta = ColorChannel::half()*(1 << yuv_shift); + n *= 3; + + if (scn == 3) + { + for ( ; i <= n - 96; i += 96, src += scn * 32) + { + __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src)); + __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 16)); + __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 32)); + __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 48)); + __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 64)); + __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 80)); + + _MM_DEINTERLIV_EPI8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1) + + __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero; + process(_mm_unpacklo_epi8(v_r0, v_zero), + _mm_unpacklo_epi8(v_g0, v_zero), + _mm_unpacklo_epi8(v_b0, v_zero), + v_y0, v_cr0, v_cb0); + + __m128i v_y1 = v_zero, v_cr1 = v_zero, v_cb1 = v_zero; + process(_mm_unpackhi_epi8(v_r0, v_zero), + _mm_unpackhi_epi8(v_g0, v_zero), + _mm_unpackhi_epi8(v_b0, v_zero), + v_y1, v_cr1, v_cb1); + + __m128i v_y_0 = _mm_packus_epi16(v_y0, v_y1); + __m128i v_cr_0 = _mm_packus_epi16(v_cr0, v_cr1); + __m128i v_cb_0 = _mm_packus_epi16(v_cb0, v_cb1); + + process(_mm_unpacklo_epi8(v_r1, v_zero), + _mm_unpacklo_epi8(v_g1, v_zero), + _mm_unpacklo_epi8(v_b1, v_zero), + v_y0, v_cr0, v_cb0); + + process(_mm_unpackhi_epi8(v_r1, v_zero), + _mm_unpackhi_epi8(v_g1, v_zero), + _mm_unpackhi_epi8(v_b1, v_zero), + v_y1, v_cr1, v_cb1); + + __m128i v_y_1 = _mm_packus_epi16(v_y0, v_y1); + __m128i v_cr_1 = _mm_packus_epi16(v_cr0, v_cr1); + __m128i v_cb_1 = _mm_packus_epi16(v_cb0, v_cb1); + + _MM_INTERLIV_EPI8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1) + + _mm_storeu_si128((__m128i *)(dst + i), v_y_0); + _mm_storeu_si128((__m128i *)(dst + i + 16), v_y_1); + _mm_storeu_si128((__m128i *)(dst + i + 32), v_cr_0); + _mm_storeu_si128((__m128i *)(dst + i + 48), v_cr_1); + _mm_storeu_si128((__m128i *)(dst + i + 64), v_cb_0); + _mm_storeu_si128((__m128i *)(dst + i + 80), v_cb_1); + } + } + + for ( ; i < n; i += 3, src += scn) + { + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); + dst[i] = saturate_cast(Y); + dst[i+1] = saturate_cast(Cr); + dst[i+2] = saturate_cast(Cb); + } + } + + int srccn, blueIdx, coeffs[5]; + __m128i v_c0, v_c1, v_c2; + __m128i v_c3, v_c4, v_delta, v_delta2; + __m128i v_zero; +}; + + #endif template struct YCrCb2RGB_f