From d30ce2b9ac354dfbd79bceb4c36117236f684f72 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 24 Sep 2014 16:07:15 +0000 Subject: [PATCH 01/40] canny --- modules/imgproc/src/canny.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp index fa751c910..bfb37dc23 100644 --- a/modules/imgproc/src/canny.cpp +++ b/modules/imgproc/src/canny.cpp @@ -361,6 +361,13 @@ void cv::Canny( InputArray _src, OutputArray _dst, _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm); } } +#elif CV_NEON + for ( ; j < width - 8; j += 8) + { + int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j); + vst1q_s32(_norm + j, vaddq_s32(vmovl_s16(vget_low_s16(v_dx)), vmovl_s16(vget_low_s16(v_dy)))); + vst1q_s32(_norm + j + 4, vaddq_s32(vmovl_s16(vget_high_s16(v_dx)), vmovl_s16(vget_high_s16(v_dy)))); + } #endif for ( ; j < width; ++j) _norm[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j])); @@ -386,6 +393,18 @@ void cv::Canny( InputArray _src, OutputArray _dst, _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm); } } +#elif CV_NEON + for ( ; j < width - 8; j += 8) + { + int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j); + int32x4_t v_dxp = vmovl_s16(vget_low_s16(v_dx)), v_dyp = vmovl_s16(vget_low_s16(v_dy)); + int32x4_t v_dst = vaddq_s32(vmulq_s32(v_dxp, v_dxp), vmulq_s32(v_dyp, v_dyp)); + vst1q_s32(_norm + j, v_dst); + + v_dxp = vmovl_s16(vget_high_s16(v_dx)), v_dyp = vmovl_s16(vget_high_s16(v_dy)); + v_dst = vaddq_s32(vmulq_s32(v_dxp, v_dxp), vmulq_s32(v_dyp, v_dyp)); + vst1q_s32(_norm + j, v_dst); + } #endif for ( ; j < width; ++j) _norm[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j]; From 12001a42f9ba8a11a4b10e5cf6b6090df4181a6d Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 24 Sep 2014 22:33:23 +0400 Subject: [PATCH 02/40] corners --- modules/imgproc/src/canny.cpp | 4 ++-- modules/imgproc/src/corner.cpp | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp index bfb37dc23..11365a33f 100644 --- a/modules/imgproc/src/canny.cpp +++ b/modules/imgproc/src/canny.cpp @@ -362,7 +362,7 @@ void cv::Canny( InputArray _src, OutputArray _dst, } } #elif CV_NEON - for ( ; j < width - 8; j += 8) + for ( ; j <= width - 8; j += 8) { int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j); vst1q_s32(_norm + j, vaddq_s32(vmovl_s16(vget_low_s16(v_dx)), vmovl_s16(vget_low_s16(v_dy)))); @@ -394,7 +394,7 @@ void cv::Canny( InputArray _src, OutputArray _dst, } } #elif CV_NEON - for ( ; j < width - 8; j += 8) + for ( ; j <= width - 8; j += 8) { int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j); int32x4_t v_dxp = vmovl_s16(vget_low_s16(v_dx)), v_dyp = vmovl_s16(vget_low_s16(v_dy)); diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp index 631878427..01265a5b6 100644 --- a/modules/imgproc/src/corner.cpp +++ b/modules/imgproc/src/corner.cpp @@ -146,6 +146,18 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k ) _mm_storeu_ps(dst + j, a); } } + #elif CV_NEON + float32x4_t v_k = vdupq_n_f32((float)k)); + + for( ; j <= size.width - 4; j += 4 ) + { + float32x4x3_t v_src = vld3q_f32(cov + j + 3); + float32x4_t v_a = v_src.val[0], v_b = v_src.val[1], v_c = v_src.val[2]; + float32x4_t v_ac_bb = vsubq_f32(vmulq_f32(v_a, v_c), vmulq_f32(v_b, v_b)); + float32x4_t v_ac = vaddq_f32(v_a, v_c); + float32x4_t v_prod = vmulq_f32(v_k, vmulq_f32(v_ac, v_ac)); + vst1q_f32(dst + j, 
vsubq_f32(v_ac_bb, v_prod)); + } #endif for( ; j < size.width; j++ ) @@ -641,6 +653,15 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord _mm_storeu_ps(dstdata + j, v_s1); } } +#elif CV_NEON + for( ; j <= size.width - 4; j += 4 ) + { + float32x4_t v_dx = vld1q_f32(dxdata + j), v_dy = vld1q_f32(dydata + j); + float32x4_t v_s1 = vmulq_f32(v_dx, vmulq_f32(v_dx, vld1q_f32(d2ydata + j))); + float32x4_t v_s2 = vmulq_f32(v_dy, vmulq_f32(v_dy, vld1q_f32(d2xdata + j))); + float32x4_t v_s3 = vmulq_f32(v_dx, vmulq_f32(v_dy, vld1q_f32(dxydata + j))); + vst1q_f32(dstdata + j, vaddq_f32(vaddq_f32(v_s1, v_s2), vmulq_n_f32(v_s3, -2.0f))); + } #endif for( ; j < size.width; j++ ) From c0b702a994f8debe0e30cf782f82f80cc2bfbc99 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 25 Sep 2014 00:17:23 +0400 Subject: [PATCH 03/40] cv::resize area 2x --- modules/imgproc/src/canny.cpp | 16 ++++--- modules/imgproc/src/corner.cpp | 18 +++---- modules/imgproc/src/imgwarp.cpp | 67 ++++++++++++++++++++++++++- modules/imgproc/test/test_imgwarp.cpp | 48 +++++++++++++++++++ 4 files changed, 132 insertions(+), 17 deletions(-) diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp index 11365a33f..e9e64c571 100644 --- a/modules/imgproc/src/canny.cpp +++ b/modules/imgproc/src/canny.cpp @@ -365,8 +365,10 @@ void cv::Canny( InputArray _src, OutputArray _dst, for ( ; j <= width - 8; j += 8) { int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j); - vst1q_s32(_norm + j, vaddq_s32(vmovl_s16(vget_low_s16(v_dx)), vmovl_s16(vget_low_s16(v_dy)))); - vst1q_s32(_norm + j + 4, vaddq_s32(vmovl_s16(vget_high_s16(v_dx)), vmovl_s16(vget_high_s16(v_dy)))); + vst1q_s32(_norm + j, vaddq_s32(vabsq_s32(vmovl_s16(vget_low_s16(v_dx))), + vabsq_s32(vmovl_s16(vget_low_s16(v_dy))))); + vst1q_s32(_norm + j + 4, vaddq_s32(vabsq_s32(vmovl_s16(vget_high_s16(v_dx))), + vabsq_s32(vmovl_s16(vget_high_s16(v_dy))))); } #endif for ( ; j < width; ++j) @@ -397,13 +399,13 @@ void cv::Canny( InputArray _src, OutputArray _dst, for ( ; j <= width - 8; j += 8) { int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j); - int32x4_t v_dxp = vmovl_s16(vget_low_s16(v_dx)), v_dyp = vmovl_s16(vget_low_s16(v_dy)); - int32x4_t v_dst = vaddq_s32(vmulq_s32(v_dxp, v_dxp), vmulq_s32(v_dyp, v_dyp)); + int16x4_t v_dxp = vget_low_s16(v_dx), v_dyp = vget_low_s16(v_dy); + int32x4_t v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp); vst1q_s32(_norm + j, v_dst); - v_dxp = vmovl_s16(vget_high_s16(v_dx)), v_dyp = vmovl_s16(vget_high_s16(v_dy)); - v_dst = vaddq_s32(vmulq_s32(v_dxp, v_dxp), vmulq_s32(v_dyp, v_dyp)); - vst1q_s32(_norm + j, v_dst); + v_dxp = vget_high_s16(v_dx), v_dyp = vget_high_s16(v_dy); + v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp); + vst1q_s32(_norm + j + 4, v_dst); } #endif for ( ; j < width; ++j) diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp index 01265a5b6..60d14cf0d 100644 --- a/modules/imgproc/src/corner.cpp +++ b/modules/imgproc/src/corner.cpp @@ -147,16 +147,15 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k ) } } #elif CV_NEON - float32x4_t v_k = vdupq_n_f32((float)k)); + float32x4_t v_k = vdupq_n_f32((float)k); for( ; j <= size.width - 4; j += 4 ) { float32x4x3_t v_src = vld3q_f32(cov + j + 3); float32x4_t v_a = v_src.val[0], v_b = v_src.val[1], v_c = v_src.val[2]; - float32x4_t v_ac_bb = vsubq_f32(vmulq_f32(v_a, v_c), vmulq_f32(v_b, v_b)); + float32x4_t v_ac_bb = vmlsq_f32(vmulq_f32(v_a, v_c), v_b, v_b); float32x4_t v_ac = 
vaddq_f32(v_a, v_c); - float32x4_t v_prod = vmulq_f32(v_k, vmulq_f32(v_ac, v_ac)); - vst1q_f32(dst + j, vsubq_f32(v_ac_bb, v_prod)); + vst1q_f32(dst + j, vmlsq_f32(v_ac_bb, v_k, vmulq_f32(v_ac, v_ac))); } #endif @@ -619,10 +618,11 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord if( src.depth() == CV_8U ) factor *= 255; factor = 1./(factor * factor * factor); + float factor_f = (float)factor; #if CV_SSE2 volatile bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2); - __m128 v_factor = _mm_set1_ps((float)factor), v_m2 = _mm_set1_ps(-2.0f); + __m128 v_factor = _mm_set1_ps(factor_f), v_m2 = _mm_set1_ps(-2.0f); #endif Size size = src.size(); @@ -657,10 +657,10 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord for( ; j <= size.width - 4; j += 4 ) { float32x4_t v_dx = vld1q_f32(dxdata + j), v_dy = vld1q_f32(dydata + j); - float32x4_t v_s1 = vmulq_f32(v_dx, vmulq_f32(v_dx, vld1q_f32(d2ydata + j))); - float32x4_t v_s2 = vmulq_f32(v_dy, vmulq_f32(v_dy, vld1q_f32(d2xdata + j))); - float32x4_t v_s3 = vmulq_f32(v_dx, vmulq_f32(v_dy, vld1q_f32(dxydata + j))); - vst1q_f32(dstdata + j, vaddq_f32(vaddq_f32(v_s1, v_s2), vmulq_n_f32(v_s3, -2.0f))); + float32x4_t v_s = vmulq_f32(v_dx, vmulq_f32(v_dx, vld1q_f32(d2ydata + j))); + v_s = vmlaq_f32(v_s, vld1q_f32(d2xdata + j), vmulq_f32(v_dy, v_dy)); + v_s = vmlaq_f32(v_s, vld1q_f32(dxydata + j), vmulq_n_f32(vmulq_f32(v_dy, v_dx), -2)); + vst1q_f32(dstdata + j, vmulq_n_f32(v_s, factor_f)); } #endif diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index fd5f19172..01fc592e2 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -1322,7 +1322,72 @@ struct ResizeAreaFastNoVec { return 0; } }; -#if CV_SSE2 +#if CV_NEON + +class ResizeAreaFastVec_SIMD_8u +{ +public: + ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : + cn(_cn), step(_step) + { + } + + int operator() (const uchar* S, uchar* D, int w) const + { + int dx = 0; + const uchar* S0 = S, * S1 = S0 + step; + + uint16x8_t v_2 = vdupq_n_u16(2); + + if (cn == 1) + { + for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16) + { + uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1); + + uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1])); + v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1]))); + v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2); + + uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1])); + v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1]))); + v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2); + + vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1))); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1); + + uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0)); + uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0)); + uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1)); + uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1)); + + uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)), + vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10))); + uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)), + vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11))); + uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2); + + vst1_u8(D, 
vmovn_u16(v_dst)); + } + } + + return dx; + } + +private: + int cn, step; +}; + +typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_16u; + +#elif CV_SSE2 + class ResizeAreaFastVec_SIMD_8u { public: diff --git a/modules/imgproc/test/test_imgwarp.cpp b/modules/imgproc/test/test_imgwarp.cpp index 3146ff72a..91470cfec 100644 --- a/modules/imgproc/test/test_imgwarp.cpp +++ b/modules/imgproc/test/test_imgwarp.cpp @@ -1545,4 +1545,52 @@ TEST(Imgproc_InitUndistortMap, accuracy) { CV_UndistortMapTest test; test.safe_r TEST(Imgproc_GetRectSubPix, accuracy) { CV_GetRectSubPixTest test; test.safe_run(); } TEST(Imgproc_GetQuadSubPix, accuracy) { CV_GetQuadSubPixTest test; test.safe_run(); } +////////////////////////////////////////////////////////////////////////// + +template +void resizeArea(const cv::Mat & src, cv::Mat & dst) +{ + int cn = src.channels(); + + for (int y = 0; y < dst.rows; ++y) + { + const T * sptr0 = src.ptr(y << 1); + const T * sptr1 = src.ptr((y << 1) + 1); + T * dptr = dst.ptr(y); + + for (int x = 0; x < dst.cols * cn; x += cn) + { + int x1 = x << 1; + + for (int c = 0; c < cn; ++c) + { + WT sum = WT(sptr0[x1 + c]) + WT(sptr0[x1 + c + cn]); + sum += WT(sptr1[x1 + c]) + WT(sptr1[x1 + c + cn]) + (WT)(2); + + dptr[x + c] = cv::saturate_cast(sum >> 2); + } + } + } +} + +TEST(Resize, Area_half) +{ + int types[] = { CV_8UC1, CV_8UC4 }; + + for (int i = 0, size = sizeof(types) / sizeof(types[0]); i < size; ++i) + { + int type = types[i]; + cv::Mat src(100, 100, type), dst_actual(50, 50, type), dst_reference(50, 50, type); + + if (CV_MAT_DEPTH(type) == CV_8U) + resizeArea(src, dst_reference); + else + CV_Assert(0); + + cv::resize(src, dst_actual, dst_actual.size(), 0, 0, cv::INTER_AREA); + + ASSERT_EQ(0, cvtest::norm(dst_reference, dst_actual, cv::NORM_INF)); + } +} + /* End of file. 
*/ From 4dd614864622342f9e746c91e3cd1c9e16d2fc33 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 25 Sep 2014 13:55:52 +0000 Subject: [PATCH 04/40] cv::resize 16uc1 --- modules/imgproc/src/imgwarp.cpp | 52 ++++++++++++++++++++++++++- modules/imgproc/test/test_imgwarp.cpp | 21 ++++++++--- 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 01fc592e2..07835e6f9 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -49,6 +49,8 @@ #include "precomp.hpp" #include "opencl_kernels_imgproc.hpp" +#include + #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) static IppStatus sts = ippInit(); #endif @@ -1384,7 +1386,55 @@ private: int cn, step; }; -typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_16u; +class ResizeAreaFastVec_SIMD_16u +{ +public: + ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : + cn(_cn), step(_step) + { + } + + int operator() (const ushort * S, ushort * D, int w) const + { + int dx = 0; + const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step); + + uint32x4_t v_2 = vdupq_n_u32(2); + + if (cn == 1) + { + for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1); + + uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1])); + v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1]))); + v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2); + + uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1])); + v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1]))); + v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2); + + vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))); + } + } + else if (cn == 4) + { + for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1); + uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)), + vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1))); + vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2))); + } + } + + return dx; + } + +private: + int cn, step; +}; #elif CV_SSE2 diff --git a/modules/imgproc/test/test_imgwarp.cpp b/modules/imgproc/test/test_imgwarp.cpp index 91470cfec..eb13d3540 100644 --- a/modules/imgproc/test/test_imgwarp.cpp +++ b/modules/imgproc/test/test_imgwarp.cpp @@ -1575,15 +1575,26 @@ void resizeArea(const cv::Mat & src, cv::Mat & dst) TEST(Resize, Area_half) { - int types[] = { CV_8UC1, CV_8UC4 }; + const int size = 10; + int types[] = { CV_8UC1, CV_8UC4, CV_16UC1, CV_16UC4 }; - for (int i = 0, size = sizeof(types) / sizeof(types[0]); i < size; ++i) + cv::RNG rng(17); + + for (int i = 0, _size = sizeof(types) / sizeof(types[0]); i < _size; ++i) { - int type = types[i]; - cv::Mat src(100, 100, type), dst_actual(50, 50, type), dst_reference(50, 50, type); + int type = types[i], depth = CV_MAT_DEPTH(type); - if (CV_MAT_DEPTH(type) == CV_8U) + SCOPED_TRACE(depth); + + cv::Mat src(size, size, type), dst_actual(size >> 1, size >> 1, type), + dst_reference(size >> 1, size >> 1, type); + + rng.fill(src, cv::RNG::UNIFORM, 0, 1000, true); + + if (depth == CV_8U) resizeArea(src, dst_reference); + else if (depth == CV_16U) + resizeArea(src, dst_reference); else CV_Assert(0); From 6377f5a45803275f0782d2f49cdb8148b6e52bdc Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 26 Sep 2014 08:59:32 
+0000 Subject: [PATCH 05/40] fixed warps accuracy tests --- modules/imgproc/test/test_imgwarp_strict.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/modules/imgproc/test/test_imgwarp_strict.cpp b/modules/imgproc/test/test_imgwarp_strict.cpp index 669c60186..4ec4b8683 100644 --- a/modules/imgproc/test/test_imgwarp_strict.cpp +++ b/modules/imgproc/test/test_imgwarp_strict.cpp @@ -725,19 +725,25 @@ void CV_Remap_Test::generate_test_data() case CV_32FC2: { - MatIterator_ begin_x = mapx.begin(), end_x = mapx.end(); float fscols = static_cast(std::max(src.cols - 1 + n, 0)), fsrows = static_cast(std::max(src.rows - 1 + n, 0)); - for ( ; begin_x != end_x; ++begin_x) + int width = mapx.cols << 1; + + for (int y = 0; y < mapx.rows; ++y) { - begin_x[0] = rng.uniform(_n, fscols); - begin_x[1] = rng.uniform(_n, fsrows); + float * ptr = mapx.ptr(y); + + for (int x = 0; x < width; x += 2) + { + ptr[x] = rng.uniform(_n, fscols); + ptr[x + 1] = rng.uniform(_n, fsrows); + } } } break; default: - assert(0); + CV_Assert(0); break; } } From b4fc87c3802484b577146c06301086df37e50e7d Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 26 Sep 2014 09:41:41 +0000 Subject: [PATCH 06/40] cv::remap --- modules/imgproc/src/imgwarp.cpp | 54 ++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 07835e6f9..861b7ae3a 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -3580,6 +3580,14 @@ public: _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1); } } + #elif CV_NEON + for( ; x1 <= bcols - 4; x1 += 4 ) + { + int32x4_t v_sx = cv_vrndq_s32_f32(vld1q_f32(sX + x1)), + v_sy = cv_vrndq_s32_f32(vld1q_f32(sY + x1)); + int16x4x2_t v_dst = vzip_s16(vqmovn_s32(v_sx), vqmovn_s32(v_sy)); + vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1])); + } #endif for( ; x1 < bcols; x1++ ) @@ -3604,7 +3612,15 @@ public: bufxy = (*m1)(Rect(x, y, bcols, brows)); const ushort* sA = m2->ptr(y+y1) + x; - for( x1 = 0; x1 < bcols; x1++ ) + x1 = 0; + + #if CV_NEON + uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1); + for ( ; x1 <= bcols - 8; x1 += 8) + vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale)); + #endif + + for( ; x1 < bcols; x1++ ) A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1)); } else if( planar_input ) @@ -3649,6 +3665,22 @@ public: _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1); } } + #elif CV_NEON + float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); + int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE); + + for( ; x1 <= bcols - 4; x1 += 4 ) + { + int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)), + v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale)); + int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3, + vandq_s32(v_sy, v_scale2)); + vst1_u16(A + x1, vqmovun_s32(v_v)); + + int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)), + vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS))); + vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1])); + } #endif for( ; x1 < bcols; x1++ ) @@ -3664,6 +3696,26 @@ public: else { const float* sXY = m1->ptr(y+y1) + x*2; + x1 = 0; + + #if CV_NEON + float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE); + int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE); + + for( ; x1 <= bcols - 4; x1 += 4 ) + { + float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1)); + 
int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale)); + int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale)); + int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3, + vandq_s32(v_sy, v_scale2)); + vst1_u16(A + x1, vqmovun_s32(v_v)); + + int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)), + vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS))); + vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1])); + } + #endif for( x1 = 0; x1 < bcols; x1++ ) { From 09fbc78a97ff23b34bb92f858964468d49c573e8 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 26 Sep 2014 10:55:40 +0000 Subject: [PATCH 07/40] cv::threshold --- modules/imgproc/src/thresh.cpp | 152 +++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp index 09a1f6eea..721a596e7 100644 --- a/modules/imgproc/src/thresh.cpp +++ b/modules/imgproc/src/thresh.cpp @@ -264,6 +264,74 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type ) } } } +#elif CV_NEON + uint8x16_t v_thresh = vdupq_n_u8(thresh), v_maxval = vdupq_n_u8(maxval); + + switch( type ) + { + case THRESH_BINARY: + for( i = 0; i < roi.height; i++ ) + { + const uchar* src = _src.ptr() + src_step*i; + uchar* dst = _dst.ptr() + dst_step*i; + + for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16) + vst1q_u8(dst + j_scalar, vandq_u8(vcgtq_u8(vld1q_u8(src + j_scalar), v_thresh), v_maxval)); + } + break; + + case THRESH_BINARY_INV: + for( i = 0; i < roi.height; i++ ) + { + const uchar* src = _src.ptr() + src_step*i; + uchar* dst = _dst.ptr() + dst_step*i; + + for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16) + vst1q_u8(dst + j_scalar, vandq_u8(vcleq_u8(vld1q_u8(src + j_scalar), v_thresh), v_maxval)); + } + break; + + case THRESH_TRUNC: + for( i = 0; i < roi.height; i++ ) + { + const uchar* src = _src.ptr() + src_step*i; + uchar* dst = _dst.ptr() + dst_step*i; + + for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16) + vst1q_u8(dst + j_scalar, vminq_u8(vld1q_u8(src + j_scalar), v_thresh)); + } + break; + + case THRESH_TOZERO: + for( i = 0; i < roi.height; i++ ) + { + const uchar* src = _src.ptr() + src_step*i; + uchar* dst = _dst.ptr() + dst_step*i; + + for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16) + { + uint8x16_t v_src = vld1q_u8(src + j_scalar), v_mask = vcgtq_u8(v_src, v_thresh); + vst1q_u8(dst + j_scalar, vandq_u8(v_mask, v_src)); + } + } + break; + + case THRESH_TOZERO_INV: + for( i = 0; i < roi.height; i++ ) + { + const uchar* src = _src.ptr() + src_step*i; + uchar* dst = _dst.ptr() + dst_step*i; + + for ( j_scalar = 0; j_scalar <= roi.width - 16; j_scalar += 16) + { + uint8x16_t v_src = vld1q_u8(src + j_scalar), v_mask = vcleq_u8(v_src, v_thresh); + vst1q_u8(dst + j_scalar, vandq_u8(v_mask, v_src)); + } + } + break; + default: + return CV_Error( CV_StsBadArg, "" ); + } #endif if( j_scalar < roi.width ) @@ -382,6 +450,14 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } + #elif CV_NEON + int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval); + + for( ; j <= roi.width - 8; j += 8 ) + { + uint16x8_t v_mask = vcgtq_s16(vld1q_s16(src + j), v_thresh); + vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval)); + } #endif for( ; j < roi.width; j++ ) @@ -410,6 +486,14 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) 
_mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } + #elif CV_NEON + int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval); + + for( ; j <= roi.width - 8; j += 8 ) + { + uint16x8_t v_mask = vcleq_s16(vld1q_s16(src + j), v_thresh); + vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval)); + } #endif for( ; j < roi.width; j++ ) @@ -436,6 +520,11 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } + #elif CV_NEON + int16x8_t v_thresh = vdupq_n_s16(thresh); + + for( ; j <= roi.width - 8; j += 8 ) + vst1q_s16(dst + j, vminq_s16(vld1q_s16(src + j), v_thresh)); #endif for( ; j < roi.width; j++ ) @@ -462,6 +551,15 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } + #elif CV_NEON + int16x8_t v_thresh = vdupq_n_s16(thresh); + + for( ; j <= roi.width - 8; j += 8 ) + { + int16x8_t v_src = vld1q_s16(src + j); + uint16x8_t v_mask = vcgtq_s16(v_src, v_thresh); + vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src)); + } #endif for( ; j < roi.width; j++ ) @@ -491,6 +589,15 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } + #elif CV_NEON + int16x8_t v_thresh = vdupq_n_s16(thresh); + + for( ; j <= roi.width - 8; j += 8 ) + { + int16x8_t v_src = vld1q_s16(src + j); + uint16x8_t v_mask = vcleq_s16(v_src, v_thresh); + vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src)); + } #endif for( ; j < roi.width; j++ ) { @@ -576,6 +683,16 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) _mm_storeu_ps( dst + j + 4, v1 ); } } +#elif CV_NEON + float32x4_t v_thresh = vdupq_n_f32(thresh); + uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval)); + + for( ; j <= roi.width - 4; j += 4 ) + { + float32x4_t v_src = vld1q_f32(src + j); + uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh), v_maxval); + vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst)); + } #endif for( ; j < roi.width; j++ ) @@ -604,6 +721,16 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) _mm_storeu_ps( dst + j + 4, v1 ); } } +#elif CV_NEON + float32x4_t v_thresh = vdupq_n_f32(thresh); + uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval)); + + for( ; j <= roi.width - 4; j += 4 ) + { + float32x4_t v_src = vld1q_f32(src + j); + uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh), v_maxval); + vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst)); + } #endif for( ; j < roi.width; j++ ) @@ -630,6 +757,11 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) _mm_storeu_ps( dst + j + 4, v1 ); } } +#elif CV_NEON + float32x4_t v_thresh = vdupq_n_f32(thresh); + + for( ; j <= roi.width - 4; j += 4 ) + vst1q_f32(dst + j, vminq_f32(vld1q_f32(src + j), v_thresh)); #endif for( ; j < roi.width; j++ ) @@ -656,6 +788,16 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type ) _mm_storeu_ps( dst + j + 4, v1 ); } } +#elif CV_NEON + float32x4_t v_thresh = vdupq_n_f32(thresh); + + for( ; j <= roi.width - 4; j += 4 ) + { + float32x4_t v_src = vld1q_f32(src + j); + uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh), + vreinterpretq_u32_f32(v_src)); + vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst)); + } #endif for( ; j < roi.width; j++ ) @@ -685,6 +827,16 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int 
type ) _mm_storeu_ps( dst + j + 4, v1 ); } } +#elif CV_NEON + float32x4_t v_thresh = vdupq_n_f32(thresh); + + for( ; j <= roi.width - 4; j += 4 ) + { + float32x4_t v_src = vld1q_f32(src + j); + uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh), + vreinterpretq_u32_f32(v_src)); + vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst)); + } #endif for( ; j < roi.width; j++ ) { From d090fcf2fe2c8127453c2e1b93bc62c3f37151a5 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 26 Sep 2014 14:39:04 +0000 Subject: [PATCH 08/40] cv::moments (CV_8UC1) --- modules/imgproc/src/moments.cpp | 91 ++++++++++++++++++++++++++++----- 1 file changed, 78 insertions(+), 13 deletions(-) diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp index d68248c36..b292d9982 100644 --- a/modules/imgproc/src/moments.cpp +++ b/modules/imgproc/src/moments.cpp @@ -203,7 +203,7 @@ static Moments contourMoments( const Mat& contour ) \****************************************************************************************/ template -struct MomentsInTile_SSE +struct MomentsInTile_SIMD { int operator() (const T *, int, WT &, WT &, WT &, MT &) { @@ -214,9 +214,9 @@ struct MomentsInTile_SSE #if CV_SSE2 template <> -struct MomentsInTile_SSE +struct MomentsInTile_SIMD { - MomentsInTile_SSE() + MomentsInTile_SIMD() { useSIMD = checkHardwareSupport(CV_CPU_SSE2); } @@ -234,17 +234,16 @@ struct MomentsInTile_SSE for( ; x <= len - 8; x += 8 ) { __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z); - qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z)); - __m128i px = _mm_mullo_epi16(p, qx); __m128i sx = _mm_mullo_epi16(qx, qx); + + qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z)); qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx)); qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx)); - qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(px, sx)); + qx3 = _mm_add_epi32(qx3, _mm_madd_epi16( _mm_mullo_epi16(p, qx), sx)); qx = _mm_add_epi16(qx, dx); } - int CV_DECL_ALIGNED(16) buf[4]; _mm_store_si128((__m128i*)buf, qx0); x0 = buf[0] + buf[1] + buf[2] + buf[3]; _mm_store_si128((__m128i*)buf, qx1); @@ -258,17 +257,84 @@ struct MomentsInTile_SSE return x; } + int CV_DECL_ALIGNED(16) buf[4]; bool useSIMD; }; +#elif CV_NEON + +template <> +struct MomentsInTile_SIMD +{ + MomentsInTile_SIMD() + { + ushort CV_DECL_ALIGNED(8) init[4] = { 0, 1, 2, 3 }; + qx_init = vld1_u16(init); + v_step = vdup_n_u16(4); + } + + int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3) + { + int x = 0; + + uint32x4_t v_z = vdupq_n_u32(0), v_x0 = v_z, v_x1 = v_z, + v_x2 = v_z, v_x3 = v_z; + uint16x4_t qx = qx_init; + + for( ; x <= len - 8; x += 8 ) + { + uint16x8_t v_src = vmovl_u8(vld1_u8(ptr + x)); + + // first part + uint32x4_t v_qx = vmovl_u16(qx); + uint16x4_t v_p = vget_low_u16(v_src); + uint32x4_t v_px = vmull_u16(qx, v_p); + + v_x0 = vaddw_u16(v_x0, v_p); + v_x1 = vaddq_u32(v_x1, v_px); + v_px = vmulq_u32(v_px, v_qx); + v_x2 = vaddq_u32(v_x2, v_px); + v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx)); + qx = vadd_u16(qx, v_step); + + // second part + v_qx = vmovl_u16(qx); + v_p = vget_high_u16(v_src); + v_px = vmull_u16(qx, v_p); + + v_x0 = vaddw_u16(v_x0, v_p); + v_x1 = vaddq_u32(v_x1, v_px); + v_px = vmulq_u32(v_px, v_qx); + v_x2 = vaddq_u32(v_x2, v_px); + v_x3 = vaddq_u32(v_x3, vmulq_u32(v_px, v_qx)); + + qx = vadd_u16(qx, v_step); + } + + vst1q_u32(buf, v_x0); + x0 = buf[0] + buf[1] + buf[2] + buf[3]; + vst1q_u32(buf, v_x1); + x1 = buf[0] + buf[1] + buf[2] + buf[3]; + vst1q_u32(buf, v_x2); + x2 = buf[0] + buf[1] + 
buf[2] + buf[3]; + vst1q_u32(buf, v_x3); + x3 = buf[0] + buf[1] + buf[2] + buf[3]; + + return x; + } + + uint CV_DECL_ALIGNED(16) buf[4]; + uint16x4_t qx_init, v_step; +}; + #endif #if CV_SSE4_1 template <> -struct MomentsInTile_SSE +struct MomentsInTile_SIMD { - MomentsInTile_SSE() + MomentsInTile_SIMD() { useSIMD = checkHardwareSupport(CV_CPU_SSE4_1); } @@ -302,9 +368,6 @@ struct MomentsInTile_SSE v_ix1 = _mm_add_epi32(v_ix1, v_delta); } - int CV_DECL_ALIGNED(16) buf[4]; - int64 CV_DECL_ALIGNED(16) buf64[2]; - _mm_store_si128((__m128i*)buf, v_x0); x0 = buf[0] + buf[1] + buf[2] + buf[3]; _mm_store_si128((__m128i*)buf, v_x1); @@ -319,6 +382,8 @@ struct MomentsInTile_SSE return x; } + int CV_DECL_ALIGNED(16) buf[4]; + int64 CV_DECL_ALIGNED(16) buf64[2]; bool useSIMD; }; @@ -334,7 +399,7 @@ static void momentsInTile( const Mat& img, double* moments ) Size size = img.size(); int x, y; MT mom[10] = {0,0,0,0,0,0,0,0,0,0}; - MomentsInTile_SSE vop; + MomentsInTile_SIMD vop; for( y = 0; y < size.height; y++ ) { From 857a2d5bfd9cad4d57221e69a8f58729373e7192 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 28 Sep 2014 01:11:07 -0700 Subject: [PATCH 09/40] cv::addWeighted --- modules/core/src/arithm.cpp | 112 +++++++++++++++++++++++++++++++++++- 1 file changed, 111 insertions(+), 1 deletion(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index fe157f868..a33b9f1b7 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -2367,6 +2367,114 @@ void cv::divide(double scale, InputArray src2, namespace cv { +template +struct AddWeighted_SIMD +{ + int operator() (const T *, const T *, T *, int, WT, WT, WT) const + { + return 0; + } +}; + +#if CV_NEON + +template <> +struct AddWeighted_SIMD +{ + int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + float32x4_t g = vdupq_n_f32 (gamma); + + for( ; x <= width - 8; x += 8 ) + { + int8x8_t in1 = vld1_s8(src1 + x); + int16x8_t in1_16 = vmovl_s8(in1); + float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16))); + float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16))); + + int8x8_t in2 = vld1_s8(src2+x); + int16x8_t in2_16 = vmovl_s8(in2); + float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16))); + float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16))); + + float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); + float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); + out_f_l = vaddq_f32(out_f_l, g); + out_f_h = vaddq_f32(out_f_h, g); + + int16x4_t out_16_l = vqmovn_s32(vcvtq_s32_f32(out_f_l)); + int16x4_t out_16_h = vqmovn_s32(vcvtq_s32_f32(out_f_h)); + + int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h); + int8x8_t out = vqmovn_s16(out_16); + + vst1_s8(dst + x, out); + } + + return x; + } +}; + +template <> +struct AddWeighted_SIMD +{ + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + float32x4_t g = vdupq_n_f32(gamma); + + for( ; x <= width - 8; x += 8 ) + { + uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); + + float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha); + float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta); + uint16x4_t v_dst1 = vqmovn_u32(vcvtq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + v_s1 
= vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha); + v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta); + uint16x4_t v_dst2 = vqmovn_u32(vcvtq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); + } + + return x; + } +}; + +template <> +struct AddWeighted_SIMD +{ + int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + float32x4_t g = vdupq_n_f32(gamma); + + for( ; x <= width - 8; x += 8 ) + { + int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); + + float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha); + float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta); + int16x4_t v_dst1 = vqmovn_s32(vcvtq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha); + v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta); + int16x4_t v_dst2 = vqmovn_s32(vcvtq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2)); + } + + return x; + } +}; + +#endif + template static void addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size size, void* _scalars ) @@ -2377,9 +2485,11 @@ addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2, step2 /= sizeof(src2[0]); step /= sizeof(dst[0]); + AddWeighted_SIMD vop; + for( ; size.height--; src1 += step1, src2 += step2, dst += step ) { - int x = 0; + int x = vop(src1, src2, dst, size.width, alpha, beta, gamma); #if CV_ENABLE_UNROLLED for( ; x <= size.width - 4; x += 4 ) { From 74e60e44ade08a475c5025b2438da4c28379e717 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 28 Sep 2014 02:41:08 -0700 Subject: [PATCH 10/40] cv::compare --- modules/core/src/arithm.cpp | 213 +++++++++++++++++++++++++++++++++++- 1 file changed, 211 insertions(+), 2 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index a33b9f1b7..630dcacf4 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -2667,6 +2667,213 @@ void cv::addWeighted( InputArray src1, double alpha, InputArray src2, namespace cv { +template +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int) + { + } + + int operator () (const T *, const T *, uchar *, int) const + { + return 0; + } +}; + +#if CV_NEON + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + CV_Assert(code == CMP_GT || code == CMP_LE || + code == CMP_EQ || code == CMP_NE); + + v_mask = vdupq_n_u8(255); + } + + int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); + else if (code == CMP_LE) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); + else if (code == CMP_EQ) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); + else if (code == CMP_NE) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask)); + + return x; + } + + int code; + uint8x16_t v_mask; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + CV_Assert(code == CMP_GT || code == 
CMP_LE || + code == CMP_EQ || code == CMP_NE); + + v_mask = vdup_n_u8(255); + } + + int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask)); + } + + return x; + } + + int code; + uint8x8_t v_mask; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + CV_Assert(code == CMP_GT || code == CMP_LE || + code == CMP_EQ || code == CMP_NE); + + v_mask = vdup_n_u8(255); + } + + int operator () (const int * src1, const int * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); + vst1_u8(dst + x, veor_u8(v_dst, v_mask)); + } + + return x; + } + + int code; + uint8x8_t v_mask; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + CV_Assert(code == CMP_GT || code == CMP_LE || + code == CMP_EQ || code == CMP_NE); + + v_mask = vdup_n_u8(255); + } + + int operator () (const float * src1, const float * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); 
+ } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); + vst1_u8(dst + x, veor_u8(v_dst, v_mask)); + } + + return x; + } + + int code; + uint8x8_t v_mask; +}; + +#endif + template static void cmp_(const T* src1, size_t step1, const T* src2, size_t step2, uchar* dst, size_t step, Size size, int code) @@ -2680,12 +2887,14 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2, code = code == CMP_GE ? CMP_LE : CMP_GT; } + Cmp_SIMD vop(code); + if( code == CMP_GT || code == CMP_LE ) { int m = code == CMP_GT ? 0 : 255; for( ; size.height--; src1 += step1, src2 += step2, dst += step ) { - int x = 0; + int x = vop(src1, src2, dst, size.width); #if CV_ENABLE_UNROLLED for( ; x <= size.width - 4; x += 4 ) { @@ -2700,7 +2909,7 @@ cmp_(const T* src1, size_t step1, const T* src2, size_t step2, #endif for( ; x < size.width; x++ ) dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } + } } else if( code == CMP_EQ || code == CMP_NE ) { From e46332a183e2f28d36b3200a93b49728d1443d56 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 28 Sep 2014 03:48:28 -0700 Subject: [PATCH 11/40] cv::Mat::convertTo with scale and shift --- modules/core/src/convert.cpp | 722 ++++++++++++++++++++++++++++++++++- 1 file changed, 721 insertions(+), 1 deletion(-) diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 0aecb6995..4855371d6 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -1480,6 +1480,724 @@ cvtScaleAbs_( const T* src, size_t sstep, } } +template +struct cvtScale_SIMD +{ + int operator () (const T *, DT *, int, WT, WT) const + { + return 0; + } +}; + +#if CV_NEON + +// from uchar + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), + vqmovn_u32(vcvtq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), + 
vqmovn_s32(vcvtq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), + vqmovn_u32(vcvtq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), + vqmovn_s32(vcvtq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + vst1q_s32(dst + x, vcvtq_s32_f32(v_dst1)); + vst1q_s32(dst + x + 4, vcvtq_s32_f32(v_dst2)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const uchar * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vmovl_u8(vld1_u8(src + x)); + vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift)); + vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift)); + } + + return x; + } +}; + +// from schar + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), + vqmovn_u32(vcvtq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, schar * dst, int width, float 
scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), + vqmovn_s32(vcvtq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), + vqmovn_u32(vcvtq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), + vqmovn_s32(vcvtq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + vst1q_s32(dst + x, vcvtq_s32_f32(v_dst1)); + vst1q_s32(dst + x + 4, vcvtq_s32_f32(v_dst2)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const schar * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vmovl_s8(vld1_s8(src + x)); + vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift)); + vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift)); + } + + return x; + } +}; + +// from ushort + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + float32x4_t 
v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), + vqmovn_u32(vcvtq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), + vqmovn_s32(vcvtq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), + vqmovn_u32(vcvtq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), + vqmovn_s32(vcvtq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); + + vst1q_s32(dst + x, vcvtq_s32_f32(v_dst1)); + vst1q_s32(dst + x + 4, vcvtq_s32_f32(v_dst2)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const ushort * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift)); + vst1q_f32(dst + x + 4, 
vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift)); + } + + return x; + } +}; + +// from short + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vld1q_s16(src + x); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), + vqmovn_u32(vcvtq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vld1q_s16(src + x); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), + vqmovn_s32(vcvtq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vld1q_s16(src + x); + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), + vqmovn_u32(vcvtq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const short * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src = vld1q_s16(src + x); + vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift)); + vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift)); + } + + return x; + } +}; + +// from int + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), + vqmovn_u32(vcvtq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, schar * dst, int width, float scale, float shift) 
const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), + vqmovn_s32(vcvtq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), + vqmovn_u32(vcvtq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const int * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), + vqmovn_s32(vcvtq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + + return x; + } +}; + +// from float + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, uchar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), + vqmovn_u32(vcvtq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, schar * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), + vqmovn_s32(vcvtq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, ushort * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), + vqmovn_u32(vcvtq_u32_f32(v_dst2))); + 
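Each of these cvtScale_SIMD specializations follows the same shape: widen the integer lanes to 32 bits, do the scale-and-shift in float32, then narrow back with saturating moves. A minimal standalone sketch of that shape for the short-to-uchar case (illustrative only, not taken from the patches; the function name is invented, and note that vcvtq_u32_f32 truncates toward zero, which a later patch in this series swaps for the rounding helpers cv_vrndq_u32_f32/cv_vrndq_s32_f32):

    #include <arm_neon.h>
    #include <algorithm>

    // Sketch of the widen -> float scale/shift -> saturating narrow pattern.
    static void scaleConvert_16s8u(const int16_t* src, uint8_t* dst, int width,
                                   float scale, float shift)
    {
        int x = 0;
        float32x4_t v_scale = vdupq_n_f32(scale), v_shift = vdupq_n_f32(shift);
        for ( ; x <= width - 8; x += 8)
        {
            int16x8_t v_src = vld1q_s16(src + x);
            // widen each half to int32, convert to float, apply scale and shift
            float32x4_t v_lo = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
            float32x4_t v_hi = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
            // narrow back: float -> u32 (truncating) -> u16 -> u8, saturating at each step
            uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_lo)),
                                            vqmovn_u32(vcvtq_u32_f32(v_hi)));
            vst1_u8(dst + x, vqmovn_u16(v_dst));
        }
        for ( ; x < width; ++x)   // scalar tail for the remaining elements
        {
            float v = src[x] * scale + shift;
            dst[x] = (uint8_t)std::min(std::max(v, 0.f), 255.f);
        }
    }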
vst1q_u16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, short * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); + float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), + vqmovn_s32(vcvtq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, int * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 4; x += 4) + vst1q_s32(dst + x, vcvtq_s32_f32(vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift))); + + return x; + } +}; + +template <> +struct cvtScale_SIMD +{ + int operator () (const float * src, float * dst, int width, float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); + + for ( ; x <= width - 4; x += 4) + vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift)); + + return x; + } +}; + +#endif + template static void cvtScale_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size, @@ -1488,9 +2206,11 @@ cvtScale_( const T* src, size_t sstep, sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); + cvtScale_SIMD vop; + for( ; size.height--; src += sstep, dst += dstep ) { - int x = 0; + int x = vop(src, dst, size.width, scale, shift); #if CV_ENABLE_UNROLLED for( ; x <= size.width - 4; x += 4 ) From 34a571d37f9dbe3f7095f68b9a0ac06e5f4a561b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 28 Sep 2014 04:35:33 -0700 Subject: [PATCH 12/40] cv::Mat::dot --- modules/core/src/matmul.cpp | 121 ++++++++++++++++++++++++++++++++++-- 1 file changed, 115 insertions(+), 6 deletions(-) diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp index c5ce6d0fb..b86b5929e 100644 --- a/modules/core/src/matmul.cpp +++ b/modules/core/src/matmul.cpp @@ -2804,7 +2804,8 @@ dotProd_(const T* src1, const T* src2, int len) { int i = 0; double result = 0; - #if CV_ENABLE_UNROLLED + + #if CV_ENABLE_UNROLLED for( ; i <= len - 4; i += 4 ) result += (double)src1[i]*src2[i] + (double)src1[i+1]*src2[i+1] + (double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3]; @@ -2833,10 +2834,12 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len) { int j, len0 = len & -4, blockSize0 = (1 << 13), blockSize; __m128i z = _mm_setzero_si128(); + CV_DECL_ALIGNED(16) int buf[4]; + while( i < len0 ) { blockSize = std::min(len0 - i, blockSize0); - __m128i s = _mm_setzero_si128(); + __m128i s = z; j = 0; for( ; j <= blockSize - 16; j += 16 ) { @@ -2860,7 +2863,7 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len) s0 = _mm_madd_epi16(s0, s1); s = _mm_add_epi32(s, s0); } - CV_DECL_ALIGNED(16) int buf[4]; + _mm_store_si128((__m128i*)buf, s); r += buf[0] + buf[1] + buf[2] + buf[3]; @@ -2869,6 +2872,45 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len) i += blockSize; } } +#elif CV_NEON + int len0 = len & -8, blockSize0 = (1 << 15), blockSize; + uint32x4_t v_zero = vdupq_n_u32(0u); + CV_DECL_ALIGNED(16) uint buf[4]; + + while( i < len0 ) + { + blockSize = std::min(len0 - 
i, blockSize0); + uint32x4_t v_sum = v_zero; + + int j = 0; + for( ; j <= blockSize - 16; j += 16 ) + { + uint8x16_t v_src1 = vld1q_u8(src1 + j), v_src2 = vld1q_u8(src2 + j); + + uint16x8_t v_src10 = vmovl_u8(vget_low_u8(v_src1)), v_src20 = vmovl_u8(vget_low_u8(v_src2)); + v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20)); + v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20)); + + v_src10 = vmovl_u8(vget_high_u8(v_src1)); + v_src20 = vmovl_u8(vget_high_u8(v_src2)); + v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20)); + v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20)); + } + + for( ; j <= blockSize - 8; j += 8 ) + { + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)), v_src2 = vmovl_u8(vld1_u8(src2 + j)); + v_sum = vmlal_u16(v_sum, vget_low_u16(v_src1), vget_low_u16(v_src2)); + v_sum = vmlal_u16(v_sum, vget_high_u16(v_src1), vget_high_u16(v_src2)); + } + + vst1q_u32(buf, v_sum); + r += buf[0] + buf[1] + buf[2] + buf[3]; + + src1 += blockSize; + src2 += blockSize; + i += blockSize; + } #endif return r + dotProd_(src1, src2, len - i); } @@ -2876,7 +2918,51 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len) static double dotProd_8s(const schar* src1, const schar* src2, int len) { - return dotProd_(src1, src2, len); + int i = 0; + double r = 0.0; + +#if CV_NEON + int len0 = len & -8, blockSize0 = (1 << 14), blockSize; + int32x4_t v_zero = vdupq_n_s32(0); + CV_DECL_ALIGNED(16) int buf[4]; + + while( i < len0 ) + { + blockSize = std::min(len0 - i, blockSize0); + int32x4_t v_sum = v_zero; + + int j = 0; + for( ; j <= blockSize - 16; j += 16 ) + { + int8x16_t v_src1 = vld1q_s8(src1 + j), v_src2 = vld1q_s8(src2 + j); + + int16x8_t v_src10 = vmovl_s8(vget_low_s8(v_src1)), v_src20 = vmovl_s8(vget_low_s8(v_src2)); + v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20)); + v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20)); + + v_src10 = vmovl_s8(vget_high_s8(v_src1)); + v_src20 = vmovl_s8(vget_high_s8(v_src2)); + v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20)); + v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20)); + } + + for( ; j <= blockSize - 8; j += 8 ) + { + int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + j)), v_src2 = vmovl_s8(vld1_s8(src2 + j)); + v_sum = vmlal_s16(v_sum, vget_low_s16(v_src1), vget_low_s16(v_src2)); + v_sum = vmlal_s16(v_sum, vget_high_s16(v_src1), vget_high_s16(v_src2)); + } + + vst1q_s32(buf, v_sum); + r += buf[0] + buf[1] + buf[2] + buf[3]; + + src1 += blockSize; + src2 += blockSize; + i += blockSize; + } +#endif + + return r + dotProd_(src1, src2, len - i); } static double dotProd_16u(const ushort* src1, const ushort* src2, int len) @@ -2914,13 +3000,36 @@ static double dotProd_32s(const int* src1, const int* src2, int len) static double dotProd_32f(const float* src1, const float* src2, int len) { + double r = 0.0; + int i = 0; + #if (ARITHM_USE_IPP == 1) - double r = 0; if (0 <= ippsDotProd_32f64f(src1, src2, len, &r)) return r; setIppErrorStatus(); +#elif CV_NEON + int len0 = len & -4, blockSize0 = (1 << 15), blockSize; + float32x4_t v_zero = vdupq_n_f32(0.0f); + CV_DECL_ALIGNED(16) float buf[4]; + + while( i < len0 ) + { + blockSize = std::min(len0 - i, blockSize0); + float32x4_t v_sum = v_zero; + + int j = 0; + for( ; j <= blockSize - 4; j += 4 ) + v_sum = vmlaq_f32(v_sum, vld1q_f32(src1 + j), vld1q_f32(src2 + j)); + + vst1q_f32(buf, v_sum); + r += buf[0] + buf[1] + buf[2] + buf[3]; + + 
src1 += blockSize; + src2 += blockSize; + i += blockSize; + } #endif - return dotProd_(src1, src2, len); + return r + dotProd_(src1, src2, len - i); } static double dotProd_64f(const double* src1, const double* src2, int len) From 44ea50f1c486f728cba3f49684a633161f3d432b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 28 Sep 2014 06:36:07 -0700 Subject: [PATCH 13/40] cv::countNonZero --- modules/core/src/stat.cpp | 145 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 140 insertions(+), 5 deletions(-) diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 2cc3c8d34..ede9e3d46 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -181,7 +181,7 @@ struct Sum_SIMD for ( ; x <= len - 4; x += 4) v_sum = vaddq_u32(v_sum, vmovl_u16(vld1_u16(src0 + x))); - + unsigned int CV_DECL_ALIGNED(16) ar[4]; vst1q_u32(ar, v_sum); @@ -214,7 +214,7 @@ struct Sum_SIMD for ( ; x <= len - 4; x += 4) v_sum = vaddq_s32(v_sum, vmovl_s16(vld1_s16(src0 + x))); - + int CV_DECL_ALIGNED(16) ar[4]; vst1q_s32(ar, v_sum); @@ -426,6 +426,38 @@ static int countNonZero8u( const uchar* src, int len ) nz += tab[val & 255] + tab[val >> 8]; } } +#elif CV_NEON + int len0 = len & -16, blockSize1 = (1 << 8) - 16, blockSize0 = blockSize1 << 6; + uint32x4_t v_nz = vdupq_n_u32(0u); + uint8x16_t v_zero = vdupq_n_u8(0), v_1 = vdupq_n_u8(1); + const uchar * src0 = src; + + while( i < len0 ) + { + int blockSizei = std::min(len0 - i, blockSize0), j = 0; + + while (j < blockSizei) + { + int blockSizej = std::min(blockSizei - j, blockSize1), k = 0; + uint8x16_t v_pz = v_zero; + + for( ; k <= blockSizej - 16; k += 16 ) + v_pz = vaddq_u8(v_pz, vandq_u8(vceqq_u8(vld1q_u8(src0 + k), v_zero), v_1)); + + uint16x8_t v_p1 = vmovl_u8(vget_low_u8(v_pz)), v_p2 = vmovl_u8(vget_high_u8(v_pz)); + v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p1), vget_high_u16(v_p1)), v_nz); + v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p2), vget_high_u16(v_p2)), v_nz); + + src0 += blockSizej; + j += blockSizej; + } + + i += blockSizei; + } + + CV_DECL_ALIGNED(16) unsigned int buf[4]; + vst1q_u32(buf, v_nz); + nz += i - saturate_cast(buf[0] + buf[1] + buf[2] + buf[3]); #endif for( ; i < len; i++ ) nz += src[i] != 0; @@ -433,13 +465,116 @@ static int countNonZero8u( const uchar* src, int len ) } static int countNonZero16u( const ushort* src, int len ) -{ return countNonZero_(src, len); } +{ + int i = 0, nz = 0; +#if CV_NEON + int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; + uint32x4_t v_nz = vdupq_n_u32(0u); + uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1); + + while( i < len0 ) + { + int blockSizei = std::min(len0 - i, blockSize0), j = 0; + + while (j < blockSizei) + { + int blockSizej = std::min(blockSizei - j, blockSize1), k = 0; + uint16x8_t v_pz = v_zero; + + for( ; k <= blockSizej - 8; k += 8 ) + v_pz = vaddq_u16(v_pz, vandq_u16(vceqq_u16(vld1q_u16(src + k), v_zero), v_1)); + + v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz); + + src += blockSizej; + j += blockSizej; + } + + i += blockSizei; + } + + CV_DECL_ALIGNED(16) unsigned int buf[4]; + vst1q_u32(buf, v_nz); + nz += i - saturate_cast(buf[0] + buf[1] + buf[2] + buf[3]); +#endif + return nz + countNonZero_(src, len - i); +} static int countNonZero32s( const int* src, int len ) -{ return countNonZero_(src, len); } +{ + int i = 0, nz = 0; +#if CV_NEON + int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; + uint32x4_t v_nz = vdupq_n_u32(0u); + int32x4_t v_zero = vdupq_n_s32(0.0f); + 
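The countNonZero NEON paths here count zeros rather than non-zeros: vceqq_* against zero yields an all-ones lane mask for zero elements, AND with 1 turns the mask into a per-lane 0/1 count, the counts are widened and accumulated, and the result is the number of elements processed minus the zero count. The nested blocks keep the narrow per-lane counters from overflowing. A standalone sketch of the trick for one 16-byte vector (helper name invented for this note, not from the patch):

    #include <arm_neon.h>
    #include <stdint.h>

    // Count zero bytes in one 16-byte vector with the compare/AND trick.
    static unsigned countZeros16(const uint8_t* p)
    {
        uint8x16_t v_zero = vdupq_n_u8(0), v_1 = vdupq_n_u8(1);
        // 0xFF where p[i] == 0, then AND with 1 -> per-lane 0/1 zero counts
        uint8x16_t v_cnt = vandq_u8(vceqq_u8(vld1q_u8(p), v_zero), v_1);
        // widen and reduce the sixteen 0/1 counts to a single sum
        uint16x8_t v_sum16 = vaddl_u8(vget_low_u8(v_cnt), vget_high_u8(v_cnt));
        uint32x4_t v_sum32 = vaddl_u16(vget_low_u16(v_sum16), vget_high_u16(v_sum16));
        uint32_t buf[4];
        vst1q_u32(buf, v_sum32);
        return buf[0] + buf[1] + buf[2] + buf[3];
    }
    // countNonZero* above then report (elements processed) - (zeros counted).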
uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u); + + while( i < len0 ) + { + int blockSizei = std::min(len0 - i, blockSize0), j = 0; + + while (j < blockSizei) + { + int blockSizej = std::min(blockSizei - j, blockSize1), k = 0; + uint16x8_t v_pz = v_zerou; + + for( ; k <= blockSizej - 8; k += 8 ) + v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_s32(vld1q_s32(src + k), v_zero)), + vmovn_u32(vceqq_s32(vld1q_s32(src + k + 4), v_zero))), v_1)); + + v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz); + + src += blockSizej; + j += blockSizej; + } + + i += blockSizei; + } + + CV_DECL_ALIGNED(16) unsigned int buf[4]; + vst1q_u32(buf, v_nz); + nz += i - saturate_cast(buf[0] + buf[1] + buf[2] + buf[3]); +#endif + return nz + countNonZero_(src, len - i); +} static int countNonZero32f( const float* src, int len ) -{ return countNonZero_(src, len); } +{ + int i = 0, nz = 0; +#if CV_NEON + int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; + uint32x4_t v_nz = vdupq_n_u32(0u); + float32x4_t v_zero = vdupq_n_f32(0.0f); + uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u); + + while( i < len0 ) + { + int blockSizei = std::min(len0 - i, blockSize0), j = 0; + + while (j < blockSizei) + { + int blockSizej = std::min(blockSizei - j, blockSize1), k = 0; + uint16x8_t v_pz = v_zerou; + + for( ; k <= blockSizej - 8; k += 8 ) + v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_f32(vld1q_f32(src + k), v_zero)), + vmovn_u32(vceqq_f32(vld1q_f32(src + k + 4), v_zero))), v_1)); + + v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz); + + src += blockSizej; + j += blockSizej; + } + + i += blockSizei; + } + + CV_DECL_ALIGNED(16) unsigned int buf[4]; + vst1q_u32(buf, v_nz); + nz += i - saturate_cast(buf[0] + buf[1] + buf[2] + buf[3]); +#endif + return nz + countNonZero_(src, len - i); +} static int countNonZero64f( const double* src, int len ) { return countNonZero_(src, len); } From f50f0ba63e0fce241727fe4ef62a96a58254250b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 28 Sep 2014 07:28:33 -0700 Subject: [PATCH 14/40] cv::norm --- modules/core/src/stat.cpp | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index ede9e3d46..14cdcc803 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -2051,6 +2051,17 @@ float normL2Sqr_(const float* a, const float* b, int n) d = buf[0] + buf[1] + buf[2] + buf[3]; } else +#elif CV_NEON + float32x4_t v_sum = vdupq_n_f32(0.0f); + for ( ; j <= n - 4; j += 4) + { + float32x4_t v_diff = vmulq_f32(vld1q_f32(a + j), vld1q_f32(b + j)); + v_sum = vaddq_f32(v_sum, vmulq_f32(v_diff, v_diff)); + } + + float CV_DECL_ALIGNED(16) buf[4]; + vst1q_f32(buf, v_sum); + d = buf[0] + buf[1] + buf[2] + buf[3]; #endif { for( ; j <= n - 4; j += 4 ) @@ -2091,6 +2102,14 @@ float normL1_(const float* a, const float* b, int n) d = buf[0] + buf[1] + buf[2] + buf[3]; } else +#elif CV_NEON + float32x4_t v_sum = vdupq_n_f32(0.0f); + for ( ; j <= n - 4; j += 4) + v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j))); + + float CV_DECL_ALIGNED(16) buf[4]; + vst1q_f32(buf, v_sum); + d = buf[0] + buf[1] + buf[2] + buf[3]; #endif { for( ; j <= n - 4; j += 4 ) @@ -2131,6 +2150,19 @@ int normL1_(const uchar* a, const uchar* b, int n) d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0))); } else +#elif CV_NEON + uint32x4_t v_sum = vdupq_n_u32(0.0f); + for ( ; j <= 
n - 16; j += 16) + { + uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j)); + uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst)); + v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high))); + v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high))); + } + + uint CV_DECL_ALIGNED(16) buf[4]; + vst1q_u32(buf, v_sum); + d = buf[0] + buf[1] + buf[2] + buf[3]; #endif { for( ; j <= n - 4; j += 4 ) From bbc161e1cb073e437ea1c4ed514d05e021c71c45 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 28 Sep 2014 14:51:30 -0700 Subject: [PATCH 15/40] fix for cv::Mat::convertTo with scale --- modules/core/src/convert.cpp | 106 +++++++++++++++++------------------ 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 4855371d6..67adfde74 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -1507,8 +1507,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), - vqmovn_u32(vcvtq_u32_f32(v_dst2))); + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1_u8(dst + x, vqmovn_u16(v_dst)); } @@ -1530,8 +1530,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), - vqmovn_s32(vcvtq_s32_f32(v_dst2))); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); vst1_s8(dst + x, vqmovn_s16(v_dst)); } @@ -1553,8 +1553,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), - vqmovn_u32(vcvtq_u32_f32(v_dst2))); + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1q_u16(dst + x, v_dst); } @@ -1576,8 +1576,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), - vqmovn_s32(vcvtq_s32_f32(v_dst2))); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); vst1q_s16(dst + x, v_dst); } @@ -1599,8 +1599,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - vst1q_s32(dst + x, vcvtq_s32_f32(v_dst1)); - vst1q_s32(dst + x + 4, vcvtq_s32_f32(v_dst2)); + vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); + vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); } return x; @@ -1642,8 +1642,8 @@ struct cvtScale_SIMD float32x4_t 
v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), - vqmovn_u32(vcvtq_u32_f32(v_dst2))); + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1_u8(dst + x, vqmovn_u16(v_dst)); } @@ -1665,8 +1665,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), - vqmovn_s32(vcvtq_s32_f32(v_dst2))); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); vst1_s8(dst + x, vqmovn_s16(v_dst)); } @@ -1688,8 +1688,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), - vqmovn_u32(vcvtq_u32_f32(v_dst2))); + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1q_u16(dst + x, v_dst); } @@ -1711,8 +1711,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), - vqmovn_s32(vcvtq_s32_f32(v_dst2))); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); vst1q_s16(dst + x, v_dst); } @@ -1734,8 +1734,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - vst1q_s32(dst + x, vcvtq_s32_f32(v_dst1)); - vst1q_s32(dst + x + 4, vcvtq_s32_f32(v_dst2)); + vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); + vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); } return x; @@ -1777,8 +1777,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), - vqmovn_u32(vcvtq_u32_f32(v_dst2))); + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1_u8(dst + x, vqmovn_u16(v_dst)); } @@ -1800,8 +1800,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), - vqmovn_s32(vcvtq_s32_f32(v_dst2))); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); vst1_s8(dst + x, vqmovn_s16(v_dst)); } @@ -1823,8 +1823,8 @@ struct cvtScale_SIMD float32x4_t 
v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), - vqmovn_u32(vcvtq_u32_f32(v_dst2))); + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1q_u16(dst + x, v_dst); } @@ -1846,8 +1846,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), - vqmovn_s32(vcvtq_s32_f32(v_dst2))); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); vst1q_s16(dst + x, v_dst); } @@ -1869,8 +1869,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift); - vst1q_s32(dst + x, vcvtq_s32_f32(v_dst1)); - vst1q_s32(dst + x + 4, vcvtq_s32_f32(v_dst2)); + vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1)); + vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2)); } return x; @@ -1912,8 +1912,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), - vqmovn_u32(vcvtq_u32_f32(v_dst2))); + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1_u8(dst + x, vqmovn_u16(v_dst)); } @@ -1935,8 +1935,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), - vqmovn_s32(vcvtq_s32_f32(v_dst2))); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); vst1_s8(dst + x, vqmovn_s16(v_dst)); } @@ -1958,8 +1958,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift); - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), - vqmovn_u32(vcvtq_u32_f32(v_dst2))); + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1q_u16(dst + x, v_dst); } @@ -2001,8 +2001,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), - vqmovn_u32(vcvtq_u32_f32(v_dst2))); + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1_u8(dst + x, vqmovn_u16(v_dst)); } @@ -2023,8 +2023,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = 
vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); - int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), - vqmovn_s32(vcvtq_s32_f32(v_dst2))); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); vst1_s8(dst + x, vqmovn_s16(v_dst)); } @@ -2045,8 +2045,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), - vqmovn_u32(vcvtq_u32_f32(v_dst2))); + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1q_u16(dst + x, v_dst); } @@ -2067,8 +2067,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift); - int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), - vqmovn_s32(vcvtq_s32_f32(v_dst2))); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); vst1q_s16(dst + x, v_dst); } @@ -2091,8 +2091,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), - vqmovn_u32(vcvtq_u32_f32(v_dst2))); + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1_u8(dst + x, vqmovn_u16(v_dst)); } @@ -2113,8 +2113,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); - int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), - vqmovn_s32(vcvtq_s32_f32(v_dst2))); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); vst1_s8(dst + x, vqmovn_s16(v_dst)); } @@ -2135,8 +2135,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst1)), - vqmovn_u32(vcvtq_u32_f32(v_dst2))); + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1q_u16(dst + x, v_dst); } @@ -2157,8 +2157,8 @@ struct cvtScale_SIMD float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift); float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift); - int16x8_t v_dst = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst1)), - vqmovn_s32(vcvtq_s32_f32(v_dst2))); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); vst1q_s16(dst + x, v_dst); } @@ -2175,7 +2175,7 @@ struct cvtScale_SIMD float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale); for ( ; x <= width - 4; x += 4) - vst1q_s32(dst + x, vcvtq_s32_f32(vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift))); + vst1q_s32(dst + x, 
cv_vrndq_s32_f32(vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift))); return x; } From 1c491c42cdac6b82eba318ce8cd021020bc69c2e Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 29 Sep 2014 14:14:10 +0000 Subject: [PATCH 16/40] fix for cornerHarris --- modules/core/src/stat.cpp | 11 ----------- modules/imgproc/src/corner.cpp | 4 ++-- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 14cdcc803..d6f53dc7d 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -2051,17 +2051,6 @@ float normL2Sqr_(const float* a, const float* b, int n) d = buf[0] + buf[1] + buf[2] + buf[3]; } else -#elif CV_NEON - float32x4_t v_sum = vdupq_n_f32(0.0f); - for ( ; j <= n - 4; j += 4) - { - float32x4_t v_diff = vmulq_f32(vld1q_f32(a + j), vld1q_f32(b + j)); - v_sum = vaddq_f32(v_sum, vmulq_f32(v_diff, v_diff)); - } - - float CV_DECL_ALIGNED(16) buf[4]; - vst1q_f32(buf, v_sum); - d = buf[0] + buf[1] + buf[2] + buf[3]; #endif { for( ; j <= n - 4; j += 4 ) diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp index 60d14cf0d..4d55a4f18 100644 --- a/modules/imgproc/src/corner.cpp +++ b/modules/imgproc/src/corner.cpp @@ -126,7 +126,7 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k ) if( simd ) { __m128 k4 = _mm_set1_ps((float)k); - for( ; j <= size.width - 5; j += 4 ) + for( ; j <= size.width - 4; j += 4 ) { __m128 t0 = _mm_loadu_ps(cov + j*3); // a0 b0 c0 x __m128 t1 = _mm_loadu_ps(cov + j*3 + 3); // a1 b1 c1 x @@ -151,7 +151,7 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k ) for( ; j <= size.width - 4; j += 4 ) { - float32x4x3_t v_src = vld3q_f32(cov + j + 3); + float32x4x3_t v_src = vld3q_f32(cov + j * 3); float32x4_t v_a = v_src.val[0], v_b = v_src.val[1], v_c = v_src.val[2]; float32x4_t v_ac_bb = vmlsq_f32(vmulq_f32(v_a, v_c), v_b, v_b); float32x4_t v_ac = vaddq_f32(v_a, v_c); From 8e1ccfae3d54501246ef354a5f4b22d8a74dc756 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 29 Sep 2014 15:57:33 +0000 Subject: [PATCH 17/40] cv::boxFilter --- modules/imgproc/src/smooth.cpp | 263 ++++++++++++++++++++++++++++++++- 1 file changed, 255 insertions(+), 8 deletions(-) diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index 0872c44c6..cc1bf2826 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -132,8 +132,8 @@ struct ColumnSum : SUM = &sum[0]; if( sumCount == 0 ) { - for( i = 0; i < width; i++ ) - SUM[i] = 0; + memset((void*)SUM, 0, width*sizeof(ST)); + for( ; sumCount < ksize - 1; sumCount++, src++ ) { const ST* Sp = (const ST*)src[0]; @@ -247,13 +247,16 @@ struct ColumnSum : #if CV_SSE2 if(haveSSE2) { - for( ; i < width-4; i+=4 ) + for( ; i <= width-4; i+=4 ) { __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i)); __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i)); _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp)); } } + #elif CV_NEON + for( ; i <= width - 4; i+=4 ) + vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i))); #endif for( ; i < width; i++ ) SUM[i] += Sp[i]; @@ -277,7 +280,7 @@ struct ColumnSum : if(haveSSE2) { const __m128 scale4 = _mm_set1_ps((float)_scale); - for( ; i < width-8; i+=8 ) + for( ; i <= width-8; i+=8 ) { __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4)); @@ -298,6 +301,22 @@ struct ColumnSum : _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1)); } } + #elif CV_NEON + 
float32x4_t v_scale = vdupq_n_f32((float)_scale); + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale)); + uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale)); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d)); + vst1_u8(D + i, vqmovn_u16(v_dst)); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } #endif for( ; i < width; i++ ) { @@ -312,7 +331,7 @@ struct ColumnSum : #if CV_SSE2 if(haveSSE2) { - for( ; i < width-8; i+=8 ) + for( ; i <= width-8; i+=8 ) { __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4)); @@ -330,6 +349,18 @@ struct ColumnSum : _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1)); } } + #elif CV_NEON + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + uint16x8_t v_dst = vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01)); + vst1_u8(D + i, vqmovn_u16(v_dst)); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } #endif for( ; i < width; i++ ) @@ -390,13 +421,16 @@ struct ColumnSum : #if CV_SSE2 if(haveSSE2) { - for( ; i < width-4; i+=4 ) + for( ; i <= width-4; i+=4 ) { __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i)); __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i)); _mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp)); } } + #elif CV_NEON + for( ; i <= width - 4; i+=4 ) + vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i))); #endif for( ; i < width; i++ ) SUM[i] += Sp[i]; @@ -420,7 +454,7 @@ struct ColumnSum : if(haveSSE2) { const __m128 scale4 = _mm_set1_ps((float)_scale); - for( ; i < width-8; i+=8 ) + for( ; i <= width-8; i+=8 ) { __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); __m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4)); @@ -439,6 +473,20 @@ struct ColumnSum : _mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1)); } } + #elif CV_NEON + float32x4_t v_scale = vdupq_n_f32((float)_scale); + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale)); + int32x4_t v_s01d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale)); + vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0d), vqmovn_s32(v_s01d))); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } #endif for( ; i < width; i++ ) { @@ -453,7 +501,7 @@ struct ColumnSum : #if CV_SSE2 if(haveSSE2) { - for( ; i < width-8; i+=8 ) + for( ; i <= width-8; i+=8 ) { __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); @@ -470,6 +518,17 @@ struct ColumnSum : _mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1)); } } + #elif CV_NEON + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0), 
vqmovn_s32(v_s01))); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } #endif for( ; i < width; i++ ) @@ -537,6 +596,9 @@ struct ColumnSum : _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp)); } } + #elif CV_NEON + for( ; i <= width - 4; i+=4 ) + vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i))); #endif for( ; i < width; i++ ) SUM[i] += Sp[i]; @@ -578,6 +640,20 @@ struct ColumnSum : _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); } } + #elif CV_NEON + float32x4_t v_scale = vdupq_n_f32((float)_scale); + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale)); + uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale)); + vst1q_u16(D + i, vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d))); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } #endif for( ; i < width; i++ ) { @@ -608,6 +684,17 @@ struct ColumnSum : _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); } } + #elif CV_NEON + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + vst1q_u16(D + i, vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01))); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } #endif for( ; i < width; i++ ) @@ -626,6 +713,166 @@ struct ColumnSum : std::vector sum; }; +template<> +struct ColumnSum : + public BaseColumnFilter +{ + ColumnSum( int _ksize, int _anchor, double _scale ) : + BaseColumnFilter() + { + ksize = _ksize; + anchor = _anchor; + scale = _scale; + sumCount = 0; + } + + virtual void reset() { sumCount = 0; } + + virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) + { + int i; + int* SUM; + bool haveScale = scale != 1; + double _scale = scale; + + #if CV_SSE2 + bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); + #endif + + if( width != (int)sum.size() ) + { + sum.resize(width); + sumCount = 0; + } + + SUM = &sum[0]; + if( sumCount == 0 ) + { + memset((void *)SUM, 0, sizeof(int) * width); + + for( ; sumCount < ksize - 1; sumCount++, src++ ) + { + const int* Sp = (const int*)src[0]; + i = 0; + + #if CV_SSE2 + if(haveSSE2) + { + for( ; i < width-4; i+=4 ) + { + __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i)); + __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i)); + _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp)); + } + } + #elif CV_NEON + for( ; i <= width - 4; i+=4 ) + vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i))); + #endif + + for( ; i < width; i++ ) + SUM[i] += Sp[i]; + } + } + else + { + CV_Assert( sumCount == ksize-1 ); + src += ksize-1; + } + + for( ; count--; src++ ) + { + const int * Sp = (const int*)src[0]; + const int * Sm = (const int*)src[1-ksize]; + float* D = (float*)dst; + if( haveScale ) + { + i = 0; + + #if CV_SSE2 + if(haveSSE2) + { + const __m128 scale4 = _mm_set1_ps((float)_scale); + + for( ; i < width-4; i+=4) + { + __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); + __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), + 
_mm_loadu_si128((const __m128i*)(Sp+i))); + + _mm_storeu_ps(D+i, _mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0))); + _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); + } + } + #elif CV_NEON + float32x4_t v_scale = vdupq_n_f32((float)_scale); + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + vst1q_f32(D + i, vmulq_f32(vcvtq_f32_s32(v_s0), v_scale)); + vst1q_f32(D + i + 4, vmulq_f32(vcvtq_f32_s32(v_s01), v_scale)); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } + #endif + + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = (float)(s0*_scale); + SUM[i] = s0 - Sm[i]; + } + } + else + { + i = 0; + + #if CV_SSE2 + if(haveSSE2) + { + for( ; i < width-4; i+=4) + { + __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i)); + __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), + _mm_loadu_si128((const __m128i*)(Sp+i))); + + _mm_storeu_ps(D+i, _mm_cvtepi32_ps(_s0)); + _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm)); + } + } + #elif CV_NEON + for( ; i <= width-8; i+=8 ) + { + int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)); + int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4)); + + vst1q_f32(D + i, vcvtq_f32_s32(v_s0)); + vst1q_f32(D + i + 4, vcvtq_f32_s32(v_s01)); + + vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i))); + vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4))); + } + #endif + + for( ; i < width; i++ ) + { + int s0 = SUM[i] + Sp[i]; + D[i] = (float)(s0); + SUM[i] = s0 - Sm[i]; + } + } + dst += dststep; + } + } + + double scale; + int sumCount; + std::vector sum; +}; + #ifdef HAVE_OPENCL #define DIVUP(total, grain) ((total + grain - 1) / (grain)) From 81548a30aae65590c65f4fbb70119b413097990f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 30 Sep 2014 07:58:09 +0000 Subject: [PATCH 18/40] cv::medianBLur --- modules/imgproc/src/smooth.cpp | 89 ++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 4 deletions(-) diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp index cc1bf2826..d9678f2d6 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.cpp @@ -1607,6 +1607,21 @@ static inline void histogram_sub_simd( const HT x[16], HT y[16] ) _mm_store_si128(ry+1, r1); } +#elif CV_NEON +#define MEDIAN_HAVE_SIMD 1 + +static inline void histogram_add_simd( const HT x[16], HT y[16] ) +{ + vst1q_u16(y, vaddq_u16(vld1q_u16(x), vld1q_u16(y))); + vst1q_u16(y + 8, vaddq_u16(vld1q_u16(x + 8), vld1q_u16(y + 8))); +} + +static inline void histogram_sub_simd( const HT x[16], HT y[16] ) +{ + vst1q_u16(y, vsubq_u16(vld1q_u16(x), vld1q_u16(y))); + vst1q_u16(y + 8, vsubq_u16(vld1q_u16(x + 8), vld1q_u16(y + 8))); +} + #else #define MEDIAN_HAVE_SIMD 0 #endif @@ -1660,7 +1675,7 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) HT* h_coarse = alignPtr(&_h_coarse[0], 16); HT* h_fine = alignPtr(&_h_fine[0], 16); #if MEDIAN_HAVE_SIMD - volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); + volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); #endif for( int x = 0; x < _dst.cols; x += STRIPE_SIZE ) @@ -2108,6 +2123,71 @@ struct MinMaxVec32f } }; +#elif CV_NEON + +struct MinMaxVec8u +{ + typedef uchar value_type; + typedef uint8x16_t arg_type; + enum { SIZE = 16 }; + arg_type 
load(const uchar* ptr) { return vld1q_u8(ptr); } + void store(uchar* ptr, arg_type val) { vst1q_u8(ptr, val); } + void operator()(arg_type& a, arg_type& b) const + { + arg_type t = a; + a = vminq_u8(a, b); + b = vmaxq_u8(b, t); + } +}; + + +struct MinMaxVec16u +{ + typedef ushort value_type; + typedef uint16x8_t arg_type; + enum { SIZE = 8 }; + arg_type load(const ushort* ptr) { return vld1q_u16(ptr); } + void store(ushort* ptr, arg_type val) { vst1q_u16(ptr, val); } + void operator()(arg_type& a, arg_type& b) const + { + arg_type t = a; + a = vminq_u16(a, b); + b = vmaxq_u16(b, t); + } +}; + + +struct MinMaxVec16s +{ + typedef short value_type; + typedef int16x8_t arg_type; + enum { SIZE = 8 }; + arg_type load(const short* ptr) { return vld1q_s16(ptr); } + void store(short* ptr, arg_type val) { vst1q_s16(ptr, val); } + void operator()(arg_type& a, arg_type& b) const + { + arg_type t = a; + a = vminq_s16(a, b); + b = vmaxq_s16(b, t); + } +}; + + +struct MinMaxVec32f +{ + typedef float value_type; + typedef float32x4_t arg_type; + enum { SIZE = 4 }; + arg_type load(const float* ptr) { return vld1q_f32(ptr); } + void store(float* ptr, arg_type val) { vst1q_f32(ptr, val); } + void operator()(arg_type& a, arg_type& b) const + { + arg_type t = a; + a = vminq_f32(a, b); + b = vmaxq_f32(b, t); + } +}; + #else @@ -2134,7 +2214,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) int i, j, k, cn = _src.channels(); Op op; VecOp vop; - volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); + volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); if( m == 3 ) { @@ -2450,7 +2530,7 @@ void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize ) #endif bool useSortNet = ksize == 3 || (ksize == 5 -#if !CV_SSE2 +#if !(CV_SSE2 || CV_NEON) && src0.depth() > CV_8U #endif ); @@ -2484,7 +2564,8 @@ void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize ) CV_Assert( src.depth() == CV_8U && (cn == 1 || cn == 3 || cn == 4) ); double img_size_mp = (double)(src0.total())/(1 << 20); - if( ksize <= 3 + (img_size_mp < 1 ? 12 : img_size_mp < 4 ? 6 : 2)*(MEDIAN_HAVE_SIMD && checkHardwareSupport(CV_CPU_SSE2) ? 1 : 3)) + if( ksize <= 3 + (img_size_mp < 1 ? 12 : img_size_mp < 4 ? 6 : 2)* + (MEDIAN_HAVE_SIMD && (checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON)) ? 
1 : 3)) medianBlur_8u_Om( src, dst, ksize ); else medianBlur_8u_O1( src, dst, ksize ); From a3e56114d14de7025efd4dfed5d89e2a04619d53 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 30 Sep 2014 14:20:22 +0000 Subject: [PATCH 19/40] cv::multiply --- modules/core/src/arithm.cpp | 238 +++++++++++++++++++++++++++++++++++- 1 file changed, 236 insertions(+), 2 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 630dcacf4..a1c0d3cfd 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -1987,6 +1987,238 @@ void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst ) namespace cv { +template +struct Mul_SIMD +{ + int operator() (const T *, const T *, T *, int, WT) const + { + return 0; + } +}; + +#if CV_NEON + +template <> +struct Mul_SIMD +{ + int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); + uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); + uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); + int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); + int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + int16x8_t v_dst 
= vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1q_f32(dst + x, v_dst1); + vst1q_f32(dst + x + 4, v_dst2); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); 
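These Mul_SIMD specializations (together with the float one this loop belongs to) are the vector fast paths consumed by the scalar mul_ kernel in the hunk below; the integer variants widen to float so that the optional scale factor and the final saturation are handled uniformly. In API terms they back cv::multiply with a scale factor, e.g. (hedged usage sketch, not from the patches, with an invented function name):

    #include <opencv2/core.hpp>

    // Scaled 8-bit product: dst = saturate_cast<uchar>(a * b / 255).
    void scaledProduct(const cv::Mat& a, const cv::Mat& b, cv::Mat& dst)
    {
        CV_Assert(a.type() == CV_8UC1 && a.type() == b.type());
        // The NEON path above widens to float, multiplies, applies the scale,
        // then rounds and narrows back with saturation.
        cv::multiply(a, b, dst, 1.0 / 255.0);
    }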
+ v_dst1 = vmulq_f32(v_dst1, v_scale); + + float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + vst1q_f32(dst + x, v_dst1); + vst1q_f32(dst + x + 4, v_dst2); + } + } + + return x; + } +}; + +#endif + template static void mul_( const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size size, WT scale ) @@ -1995,11 +2227,13 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2, step2 /= sizeof(src2[0]); step /= sizeof(dst[0]); + Mul_SIMD vop; + if( scale == (WT)1. ) { for( ; size.height--; src1 += step1, src2 += step2, dst += step ) { - int i=0; + int i = vop(src1, src2, dst, size.width, scale); #if CV_ENABLE_UNROLLED for(; i <= size.width - 4; i += 4 ) { @@ -2024,7 +2258,7 @@ mul_( const T* src1, size_t step1, const T* src2, size_t step2, { for( ; size.height--; src1 += step1, src2 += step2, dst += step ) { - int i = 0; + int i = vop(src1, src2, dst, size.width, scale); #if CV_ENABLE_UNROLLED for(; i <= size.width - 4; i += 4 ) { From be3efdf274ca8749da15b3dc9a1bd35fb26b0580 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 30 Sep 2014 14:34:48 +0000 Subject: [PATCH 20/40] cv::sum refactoring --- modules/core/src/stat.cpp | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index d6f53dc7d..1abb31618 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -90,20 +90,20 @@ struct Sum_SIMD uint8x16_t v_src = vld1q_u8(src0 + x); uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src)); - v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_half))); - v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_half))); + v_sum = vaddw_u16(v_sum, vget_low_u16(v_half)); + v_sum = vaddw_u16(v_sum, vget_high_u16(v_half)); v_half = vmovl_u8(vget_high_u8(v_src)); - v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_half))); - v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_half))); + v_sum = vaddw_u16(v_sum, vget_low_u16(v_half)); + v_sum = vaddw_u16(v_sum, vget_high_u16(v_half)); } for ( ; x <= len - 8; x += 8) { uint16x8_t v_src = vmovl_u8(vld1_u8(src0 + x)); - v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_src))); - v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_src))); + v_sum = vaddw_u16(v_sum, vget_low_u16(v_src)); + v_sum = vaddw_u16(v_sum, vget_high_u16(v_src)); } unsigned int CV_DECL_ALIGNED(16) ar[4]; @@ -133,20 +133,20 @@ struct Sum_SIMD int8x16_t v_src = vld1q_s8(src0 + x); int16x8_t v_half = vmovl_s8(vget_low_s8(v_src)); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_half))); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_half))); + v_sum = vaddw_s16(v_sum, vget_low_s16(v_half)); + v_sum = vaddw_s16(v_sum, vget_high_s16(v_half)); v_half = vmovl_s8(vget_high_s8(v_src)); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_half))); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_half))); + v_sum = vaddw_s16(v_sum, vget_low_s16(v_half)); + v_sum = vaddw_s16(v_sum, vget_high_s16(v_half)); } for ( ; x <= len - 8; x += 8) { int16x8_t v_src = vmovl_s8(vld1_s8(src0 + x)); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_src))); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_src))); + v_sum = vaddw_s16(v_sum, vget_low_s16(v_src)); + v_sum = vaddw_s16(v_sum, vget_high_s16(v_src)); } int CV_DECL_ALIGNED(16) ar[4]; @@ -175,12 +175,12 @@ struct Sum_SIMD { uint16x8_t v_src = vld1q_u16(src0 + x); - v_sum = vaddq_u32(v_sum, vmovl_u16(vget_low_u16(v_src))); 
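The sum refactoring in this hunk (and in the ushort/short variants that follow) replaces each vmovl widen followed by vaddq with the single widening accumulate vaddw_u16/vaddw_s16; the result is bit-identical, just one instruction shorter per step. A small equivalence sketch (helper name invented for this note):

    #include <arm_neon.h>
    #include <stdint.h>

    // vaddq_u32(acc, vmovl_u16(v)) and vaddw_u16(acc, v) produce identical lanes.
    static bool widenAddEquivalent(uint32x4_t acc, uint16x4_t v)
    {
        uint32x4_t a = vaddq_u32(acc, vmovl_u16(v));  // old form: widen, then add
        uint32x4_t b = vaddw_u16(acc, v);             // new form: widening add
        uint32_t ba[4], bb[4];
        vst1q_u32(ba, a);
        vst1q_u32(bb, b);
        return ba[0] == bb[0] && ba[1] == bb[1] && ba[2] == bb[2] && ba[3] == bb[3];
    }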
- v_sum = vaddq_u32(v_sum, vmovl_u16(vget_high_u16(v_src))); + v_sum = vaddw_u16(v_sum, vget_low_u16(v_src)); + v_sum = vaddw_u16(v_sum, vget_high_u16(v_src)); } for ( ; x <= len - 4; x += 4) - v_sum = vaddq_u32(v_sum, vmovl_u16(vld1_u16(src0 + x))); + v_sum = vaddw_u16(v_sum, vld1_u16(src0 + x)); unsigned int CV_DECL_ALIGNED(16) ar[4]; vst1q_u32(ar, v_sum); @@ -208,12 +208,12 @@ struct Sum_SIMD { int16x8_t v_src = vld1q_s16(src0 + x); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_low_s16(v_src))); - v_sum = vaddq_s32(v_sum, vmovl_s16(vget_high_s16(v_src))); + v_sum = vaddw_s16(v_sum, vget_low_s16(v_src)); + v_sum = vaddw_s16(v_sum, vget_high_s16(v_src)); } for ( ; x <= len - 4; x += 4) - v_sum = vaddq_s32(v_sum, vmovl_s16(vld1_s16(src0 + x))); + v_sum = vaddw_s16(v_sum, vld1_s16(src0 + x)); int CV_DECL_ALIGNED(16) ar[4]; vst1q_s32(ar, v_sum); From af6a64d76bfeed4e466f5450decbb86a51466347 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 30 Sep 2014 15:42:15 +0000 Subject: [PATCH 21/40] cv::accumulate functions --- modules/imgproc/src/accum.cpp | 466 +++++++++++++++++++++++++++++++++- 1 file changed, 462 insertions(+), 4 deletions(-) diff --git a/modules/imgproc/src/accum.cpp b/modules/imgproc/src/accum.cpp index 9f23d3443..7fd2f2a10 100644 --- a/modules/imgproc/src/accum.cpp +++ b/modules/imgproc/src/accum.cpp @@ -46,10 +46,468 @@ namespace cv { +template +struct Acc_SIMD +{ + int operator() (const T *, AT *, const uchar *, int, int) const + { + return 0; + } +}; + +template +struct AccSqr_SIMD +{ + int operator() (const T *, AT *, const uchar *, int, int) const + { + return 0; + } +}; + +template +struct AccProd_SIMD +{ + int operator() (const T *, const T *, AT *, const uchar *, int, int) const + { + return 0; + } +}; + +template +struct AccW_SIMD +{ + int operator() (const T *, AT *, const uchar *, int, int, AT) const + { + return 0; + } +}; + +#if CV_NEON + +template <> +struct Acc_SIMD +{ + int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 16; x += 16) + { + uint8x16_t v_src = vld1q_u8(src); + uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src)); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))))); + vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))))); + vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))))); + } + } + else if (cn == 1) + { + uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0); + + for ( ; x <= len - 16; x += 16) + { + uint8x16_t v_src = vandq_u8(vld1q_u8(src), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0))); + uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src)); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))))); + vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))))); + vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))))); + } + } + + return x; + } +}; + +template <> +struct Acc_SIMD +{ + int operator() (const ushort * src, float * dst, const uchar 
* mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + uint32x4_t v_src0 = vmovl_u16(vget_low_u16(v_src)), v_src1 = vmovl_u16(vget_high_u16(v_src)); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1))); + } + } + + return x; + } +}; + +template <> +struct Acc_SIMD +{ + int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 8; x += 8) + { + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vld1q_f32(src + x))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vld1q_f32(src + x + 4))); + } + } + + return x; + } +}; + +template <> +struct AccSqr_SIMD +{ + int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 16; x += 16) + { + uint8x16_t v_src = vld1q_u8(src); + uint8x8_t v_src_0 = vget_low_u8(v_src), v_src_1 = vget_high_u8(v_src); + uint16x8_t v_src0 = vmull_u8(v_src_0, v_src_0), v_src1 = vmull_u8(v_src_1, v_src_1); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))))); + vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))))); + vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))))); + } + } + else if (cn == 1) + { + uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0); + + for ( ; x <= len - 16; x += 16) + { + uint8x16_t v_src = vandq_u8(vld1q_u8(src), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0))); + uint8x8_t v_src_0 = vget_low_u8(v_src), v_src_1 = vget_high_u8(v_src); + uint16x8_t v_src0 = vmull_u8(v_src_0, v_src_0), v_src1 = vmull_u8(v_src_1, v_src_1); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))))); + vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))))); + vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))))); + } + } + + return x; + } +}; + +template <> +struct AccSqr_SIMD +{ + int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + uint16x4_t v_src_0 = vget_low_u16(v_src), v_src_1 = vget_high_u16(v_src); + uint32x4_t v_src0 = vmull_u16(v_src_0, v_src_0), v_src1 = vmull_u16(v_src_1, v_src_1); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1))); + } + } + else if (cn == 1) + { + uint8x8_t v_255 = vdup_n_u8(255), v_0 = vdup_n_u8(0); + + for ( ; x <= len - 8; x += 8) + { + uint8x8_t v_mask_src = veor_u8(v_255, vceq_u8(vld1_u8(mask + x), v_0)); + uint8x8x2_t v_mask_zp = vzip_u8(v_mask_src, v_mask_src); + uint16x8_t v_mask = vreinterpretq_u16_u8(vcombine_u8(v_mask_zp.val[0], v_mask_zp.val[1])), + v_src = vandq_u16(vld1q_u16(src + x), v_mask); + + uint16x4_t 
v_src_0 = vget_low_u16(v_src), v_src_1 = vget_high_u16(v_src); + uint32x4_t v_src0 = vmull_u16(v_src_0, v_src_0), v_src1 = vmull_u16(v_src_1, v_src_1); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1))); + } + } + + return x; + } +}; + +template <> +struct AccSqr_SIMD +{ + int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 8; x += 8) + { + float32x4_t v_src = vld1q_f32(src + x); + vst1q_f32(dst + x, vmlaq_f32(vld1q_f32(dst + x), v_src, v_src)); + + v_src = vld1q_f32(src + x + 4); + vst1q_f32(dst + x + 4, vmlaq_f32(vld1q_f32(dst + x + 4), v_src, v_src)); + } + } + + return x; + } +}; + +template <> +struct AccProd_SIMD +{ + int operator() (const uchar * src1, const uchar * src2, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 16; x += 16) + { + uint8x16_t v_1src = vld1q_u8(src1), v_2src = vld1q_u8(src2); + uint16x8_t v_src0 = vmull_u8(vget_low_u8(v_1src), vget_low_u8(v_2src)), + v_src1 = vmull_u8(vget_high_u8(v_1src), vget_high_u8(v_2src)); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))))); + vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))))); + vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))))); + } + } + else if (cn == 1) + { + uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0); + + for ( ; x <= len - 16; x += 16) + { + uint8x16_t v_mask = veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0)); + uint8x16_t v_1src = vandq_u8(vld1q_u8(src1), v_mask), v_2src = vandq_u8(vld1q_u8(src2), v_mask); + uint16x8_t v_src0 = vmull_u8(vget_low_u8(v_1src), vget_low_u8(v_2src)), + v_src1 = vmull_u8(vget_high_u8(v_1src), vget_high_u8(v_2src)); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))))); + vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))))); + vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))))); + } + } + + return x; + } +}; + +template <> +struct AccProd_SIMD +{ + int operator() (const ushort * src1, const ushort * src2, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 8; x += 8) + { + uint16x8_t v_1src = vld1q_u16(src1 + x), v_2src = vld1q_u16(src2 + x); + uint32x4_t v_src0 = vmull_u16(vget_low_u16(v_1src), vget_low_u16(v_2src)), + v_src1 = vmull_u16(vget_high_u16(v_1src), vget_high_u16(v_2src)); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1))); + } + } + else if (cn == 1) + { + uint8x8_t v_255 = vdup_n_u8(255), v_0 = vdup_n_u8(0); + + for ( ; x <= len - 8; x += 8) + { + uint8x8_t v_mask_src = veor_u8(v_255, vceq_u8(vld1_u8(mask + x), v_0)); + uint8x8x2_t v_mask_zp = vzip_u8(v_mask_src, v_mask_src); + uint16x8_t v_mask = vreinterpretq_u16_u8(vcombine_u8(v_mask_zp.val[0], 
v_mask_zp.val[1])), + v_1src = vandq_u16(vld1q_u16(src1 + x), v_mask), + v_2src = vandq_u16(vld1q_u16(src2 + x), v_mask); + + uint32x4_t v_src0 = vmull_u16(vget_low_u16(v_1src), vget_low_u16(v_2src)), + v_src1 = vmull_u16(vget_high_u16(v_1src), vget_high_u16(v_2src)); + + vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0))); + vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1))); + } + } + + return x; + } +}; + +template <> +struct AccProd_SIMD +{ + int operator() (const float * src1, const float * src2, float * dst, const uchar * mask, int len, int cn) const + { + int x = 0; + + if (!mask) + { + len *= cn; + for ( ; x <= len - 8; x += 8) + { + vst1q_f32(dst + x, vmlaq_f32(vld1q_f32(dst + x), vld1q_f32(src1 + x), vld1q_f32(src2 + x))); + vst1q_f32(dst + x + 4, vmlaq_f32(vld1q_f32(dst + x + 4), vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4))); + } + } + + return x; + } +}; + +template <> +struct AccW_SIMD +{ + int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn, float alpha) const + { + int x = 0; + float32x4_t v_alpha = vdupq_n_f32(alpha), v_beta = vdupq_n_f32(1.0f - alpha); + + if (!mask) + { + len *= cn; + for ( ; x <= len - 16; x += 16) + { + uint8x16_t v_src = vld1q_u8(src); + uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src)); + + vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), v_alpha)); + vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), v_alpha)); + vst1q_f32(dst + x + 8, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 8), v_beta), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_alpha)); + vst1q_f32(dst + x + 12, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 12), v_beta), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_alpha)); + } + } + else if (cn == 1) + { + uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0); + + for ( ; x <= len - 16; x += 16) + { + uint8x16_t v_src = vandq_u8(vld1q_u8(src), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0))); + uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src)); + + vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), v_alpha)); + vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), v_alpha)); + vst1q_f32(dst + x + 8, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 8), v_beta), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_alpha)); + vst1q_f32(dst + x + 12, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 12), v_beta), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_alpha)); + } + } + + return x; + } +}; + +template <> +struct AccW_SIMD +{ + int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn, float alpha) const + { + int x = 0; + float32x4_t v_alpha = vdupq_n_f32(alpha), v_beta = vdupq_n_f32(1.0f - alpha); + + if (!mask) + { + len *= cn; + for ( ; x <= len - 8; x += 8) + { + uint16x8_t v_src = vld1q_u16(src + x); + uint32x4_t v_src0 = vmovl_u16(vget_low_u16(v_src)), v_src1 = vmovl_u16(vget_high_u16(v_src)); + + vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), vcvtq_f32_u32(v_src0), v_alpha)); + vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), vcvtq_f32_u32(v_src1), v_alpha)); + } + } + else if (cn == 1) + { + uint8x8_t v_255 = 
vdup_n_u8(255), v_0 = vdup_n_u8(0); + + for ( ; x <= len - 8; x += 8) + { + uint8x8_t v_mask_src = veor_u8(v_255, vceq_u8(vld1_u8(mask + x), v_0)); + uint8x8x2_t v_mask_zp = vzip_u8(v_mask_src, v_mask_src); + uint16x8_t v_mask = vreinterpretq_u16_u8(vcombine_u8(v_mask_zp.val[0], v_mask_zp.val[1])), + v_src = vandq_u16(vld1q_u16(src + x), v_mask); + uint32x4_t v_src0 = vmovl_u16(vget_low_u16(v_src)), v_src1 = vmovl_u16(vget_high_u16(v_src)); + + vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), vcvtq_f32_u32(v_src0), v_alpha)); + vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), vcvtq_f32_u32(v_src1), v_alpha)); + } + } + + return x; + } +}; + +template <> +struct AccW_SIMD +{ + int operator() (const float * src, float * dst, const uchar * mask, int len, int cn, float alpha) const + { + int x = 0; + float32x4_t v_alpha = vdupq_n_f32(alpha), v_beta = vdupq_n_f32(1.0f - alpha); + + if (!mask) + { + len *= cn; + for ( ; x <= len - 8; x += 8) + { + vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), vld1q_f32(src + x), v_alpha)); + vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), vld1q_f32(src + x + 4), v_alpha)); + } + } + + return x; + } +}; + +#endif + template void acc_( const T* src, AT* dst, const uchar* mask, int len, int cn ) { - int i = 0; + int i = Acc_SIMD()(src, dst, mask, len, cn); if( !mask ) { @@ -107,7 +565,7 @@ acc_( const T* src, AT* dst, const uchar* mask, int len, int cn ) template void accSqr_( const T* src, AT* dst, const uchar* mask, int len, int cn ) { - int i = 0; + int i = AccSqr_SIMD()(src, dst, mask, len, cn); if( !mask ) { @@ -165,7 +623,7 @@ accSqr_( const T* src, AT* dst, const uchar* mask, int len, int cn ) template void accProd_( const T* src1, const T* src2, AT* dst, const uchar* mask, int len, int cn ) { - int i = 0; + int i = AccProd_SIMD()(src1, src2, dst, mask, len, cn); if( !mask ) { @@ -224,7 +682,7 @@ template void accW_( const T* src, AT* dst, const uchar* mask, int len, int cn, double alpha ) { AT a = (AT)alpha, b = 1 - a; - int i = 0; + int i = AccW_SIMD()(src, dst, mask, len, cn, a); if( !mask ) { From 6f05a250bef653d8398d5ee855f6cef43194c031 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 30 Sep 2014 11:32:57 -0700 Subject: [PATCH 22/40] optimization of cv::CLAHE (~3x) --- modules/imgproc/src/clahe.cpp | 65 ++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/modules/imgproc/src/clahe.cpp b/modules/imgproc/src/clahe.cpp index c329148f2..1e5ecc3a7 100644 --- a/modules/imgproc/src/clahe.cpp +++ b/modules/imgproc/src/clahe.cpp @@ -233,6 +233,31 @@ namespace CLAHE_Interpolation_Body(const cv::Mat& src, const cv::Mat& dst, const cv::Mat& lut, const cv::Size& tileSize, const int& tilesX, const int& tilesY) : src_(src), dst_(dst), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY) { + buf.allocate(src.cols << 2); + ind1_p = (int *)buf; + ind2_p = ind1_p + src.cols; + xa_p = (float *)(ind2_p + src.cols); + xa1_p = xa_p + src.cols; + + int lut_step = static_cast(lut_.step / sizeof(T)); + float inv_tw = 1.0f / tileSize_.width; + + for (int x = 0; x < src.cols; ++x) + { + float txf = x * inv_tw - 0.5f; + + int tx1 = cvFloor(txf); + int tx2 = tx1 + 1; + + xa_p[x] = txf - tx1; + xa1_p[x] = 1.0f - xa_p[x]; + + tx1 = std::max(tx1, 0); + tx2 = std::min(tx2, tilesX_ - 1); + + ind1_p[x] = tx1 * lut_step; + ind2_p[x] = tx2 * lut_step; + } } void operator ()(const cv::Range& range) const; @@ -245,24 +270,28 @@ 
namespace cv::Size tileSize_; int tilesX_; int tilesY_; + + cv::AutoBuffer buf; + int * ind1_p, * ind2_p; + float * xa_p, * xa1_p; }; template void CLAHE_Interpolation_Body::operator ()(const cv::Range& range) const { - const size_t lut_step = lut_.step / sizeof(T); + float inv_th = 1.0f / tileSize_.height; for (int y = range.start; y < range.end; ++y) { const T* srcRow = src_.ptr(y); T* dstRow = dst_.ptr(y); - const float tyf = (static_cast(y) / tileSize_.height) - 0.5f; + float tyf = y * inv_th - 0.5f; int ty1 = cvFloor(tyf); int ty2 = ty1 + 1; - const float ya = tyf - ty1; + float ya = tyf - ty1, ya1 = 1.0f - ya; ty1 = std::max(ty1, 0); ty2 = std::min(ty2, tilesY_ - 1); @@ -272,27 +301,13 @@ namespace for (int x = 0; x < src_.cols; ++x) { - const float txf = (static_cast(x) / tileSize_.width) - 0.5f; + int srcVal = srcRow[x]; - int tx1 = cvFloor(txf); - int tx2 = tx1 + 1; + int ind1 = ind1_p[x] + srcVal; + int ind2 = ind2_p[x] + srcVal; - const float xa = txf - tx1; - - tx1 = std::max(tx1, 0); - tx2 = std::min(tx2, tilesX_ - 1); - - const int srcVal = srcRow[x]; - - const size_t ind1 = tx1 * lut_step + srcVal; - const size_t ind2 = tx2 * lut_step + srcVal; - - float res = 0; - - res += lutPlane1[ind1] * ((1.0f - xa) * (1.0f - ya)); - res += lutPlane1[ind2] * ((xa) * (1.0f - ya)); - res += lutPlane2[ind1] * ((1.0f - xa) * (ya)); - res += lutPlane2[ind2] * ((xa) * (ya)); + float res = (lutPlane1[ind1] * xa1_p[x] + lutPlane1[ind2] * xa_p[x]) * ya1 + + (lutPlane2[ind1] * xa1_p[x] + lutPlane2[ind2] * xa_p[x]) * ya; dstRow[x] = cv::saturate_cast(res); } @@ -403,7 +418,9 @@ namespace calcLutBody = cv::makePtr >(srcForLut, lut_, tileSize, tilesX_, clipLimit, lutScale); else if (_src.type() == CV_16UC1) calcLutBody = cv::makePtr >(srcForLut, lut_, tileSize, tilesX_, clipLimit, lutScale); - CV_Assert(!calcLutBody.empty()); + else + CV_Error( CV_StsBadArg, "Unsupported type" ); + cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), *calcLutBody); cv::Ptr interpolationBody; @@ -411,7 +428,7 @@ namespace interpolationBody = cv::makePtr >(src, dst, lut_, tileSize, tilesX_, tilesY_); else if (_src.type() == CV_16UC1) interpolationBody = cv::makePtr >(src, dst, lut_, tileSize, tilesX_, tilesY_); - CV_Assert(!interpolationBody.empty()); + cv::parallel_for_(cv::Range(0, src.rows), *interpolationBody); } From a54f6bb08bbd63c16449f685ce1e9cdf8f10ac13 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 5 Oct 2014 09:48:31 -0700 Subject: [PATCH 23/40] cv::cvtColor --- modules/imgproc/perf/perf_blur.cpp | 7 +- modules/imgproc/src/color.cpp | 1353 +++++++++++++++++++++++++++- modules/imgproc/src/imgwarp.cpp | 8 - 3 files changed, 1358 insertions(+), 10 deletions(-) diff --git a/modules/imgproc/perf/perf_blur.cpp b/modules/imgproc/perf/perf_blur.cpp index a6e31d3a8..3fc953ef1 100644 --- a/modules/imgproc/perf/perf_blur.cpp +++ b/modules/imgproc/perf/perf_blur.cpp @@ -98,6 +98,11 @@ PERF_TEST_P(Size_MatType_BorderType, blur16x16, Size size = get<0>(GetParam()); int type = get<1>(GetParam()); BorderType btype = get<2>(GetParam()); + double eps = 1e-3; + +#if CV_NEON + eps = CV_MAT_DEPTH(type) <= CV_32S ? 
1 : eps; +#endif Mat src(size, type); Mat dst(size, type); @@ -106,7 +111,7 @@ PERF_TEST_P(Size_MatType_BorderType, blur16x16, TEST_CYCLE() blur(src, dst, Size(16,16), Point(-1,-1), btype); - SANITY_CHECK(dst, 1e-3); + SANITY_CHECK(dst, eps); } PERF_TEST_P(Size_MatType_BorderType3x3, box3x3, diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 78236e03c..27da7953b 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -769,7 +769,6 @@ template struct RGB2Gray float coeffs[3]; }; - template<> struct RGB2Gray { typedef uchar channel_type; @@ -800,6 +799,166 @@ template<> struct RGB2Gray int tab[256*3]; }; +#if CV_NEON + +template <> +struct RGB2Gray +{ + typedef ushort channel_type; + + RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : + srccn(_srccn) + { + static const int coeffs0[] = { R2Y, G2Y, B2Y }; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0])); + if( blueIdx == 0 ) + std::swap(coeffs[0], coeffs[2]); + + v_cb = vdup_n_u16(coeffs[0]); + v_cg = vdup_n_u16(coeffs[1]); + v_cr = vdup_n_u16(coeffs[2]); + v_delta = vdupq_n_u32(1 << (yuv_shift - 1)); + } + + void operator()(const ushort* src, ushort* dst, int n) const + { + int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0; + + for ( ; i <= n - 8; i += 8, src += scn * 8) + { + uint16x8_t v_b, v_r, v_g; + if (scn == 3) + { + uint16x8x3_t v_src = vld3q_u16(src); + v_b = v_src.val[0]; + v_g = v_src.val[1]; + v_r = v_src.val[2]; + } + else + { + uint16x8x4_t v_src = vld4q_u16(src); + v_b = v_src.val[0]; + v_g = v_src.val[1]; + v_r = v_src.val[2]; + } + + uint32x4_t v_dst0_ = vmlal_u16(vmlal_u16( + vmull_u16(vget_low_u16(v_b), v_cb), + vget_low_u16(v_g), v_cg), + vget_low_u16(v_r), v_cr); + uint32x4_t v_dst1_ = vmlal_u16(vmlal_u16( + vmull_u16(vget_high_u16(v_b), v_cb), + vget_high_u16(v_g), v_cg), + vget_high_u16(v_r), v_cr); + + uint16x4_t v_dst0 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0_, v_delta), yuv_shift)); + uint16x4_t v_dst1 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1_, v_delta), yuv_shift)); + + vst1q_u16(dst + i, vcombine_u16(v_dst0, v_dst1)); + } + + for ( ; i <= n - 4; i += 4, src += scn * 4) + { + uint16x4_t v_b, v_r, v_g; + if (scn == 3) + { + uint16x4x3_t v_src = vld3_u16(src); + v_b = v_src.val[0]; + v_g = v_src.val[1]; + v_r = v_src.val[2]; + } + else + { + uint16x4x4_t v_src = vld4_u16(src); + v_b = v_src.val[0]; + v_g = v_src.val[1]; + v_r = v_src.val[2]; + } + + uint32x4_t v_dst = vmlal_u16(vmlal_u16( + vmull_u16(v_b, v_cb), + v_g, v_cg), + v_r, v_cr); + + vst1_u16(dst + i, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_delta), yuv_shift))); + } + + for( ; i < n; i++, src += scn) + dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift); + } + + int srccn, coeffs[3]; + uint16x4_t v_cb, v_cg, v_cr; + uint32x4_t v_delta; +}; + +template <> +struct RGB2Gray +{ + typedef float channel_type; + + RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + static const float coeffs0[] = { 0.299f, 0.587f, 0.114f }; + memcpy( coeffs, _coeffs ? 
_coeffs : coeffs0, 3*sizeof(coeffs[0]) ); + if(blueIdx == 0) + std::swap(coeffs[0], coeffs[2]); + + v_cb = vdupq_n_f32(coeffs[0]); + v_cg = vdupq_n_f32(coeffs[1]); + v_cr = vdupq_n_f32(coeffs[2]); + } + + void operator()(const float * src, float * dst, int n) const + { + int scn = srccn, i = 0; + float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2]; + + if (scn == 3) + { + for ( ; i <= n - 8; i += 8, src += scn * 8) + { + float32x4x3_t v_src = vld3q_f32(src); + vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + + v_src = vld3q_f32(src + scn * 4); + vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + } + + for ( ; i <= n - 4; i += 4, src += scn * 4) + { + float32x4x3_t v_src = vld3q_f32(src); + vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + } + } + else + { + for ( ; i <= n - 8; i += 8, src += scn * 8) + { + float32x4x4_t v_src = vld4q_f32(src); + vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + + v_src = vld4q_f32(src + scn * 4); + vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + } + + for ( ; i <= n - 4; i += 4, src += scn * 4) + { + float32x4x4_t v_src = vld4q_f32(src); + vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr)); + } + } + + for ( ; i < n; i++, src += scn) + dst[i] = src[0]*cb + src[1]*cg + src[2]*cr; + } + + int srccn; + float coeffs[3]; + float32x4_t v_cb, v_cg, v_cr; +}; + +#else template<> struct RGB2Gray { @@ -823,6 +982,7 @@ template<> struct RGB2Gray int coeffs[3]; }; +#endif ///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// @@ -855,6 +1015,72 @@ template struct RGB2YCrCb_f float coeffs[5]; }; +#if CV_NEON + +template <> +struct RGB2YCrCb_f +{ + typedef float channel_type; + + RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) : + srccn(_srccn), blueIdx(_blueIdx) + { + static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f}; + memcpy(coeffs, _coeffs ? 
_coeffs : coeffs0, 5*sizeof(coeffs[0])); + if(blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = vdupq_n_f32(coeffs[0]); + v_c1 = vdupq_n_f32(coeffs[1]); + v_c2 = vdupq_n_f32(coeffs[2]); + v_c3 = vdupq_n_f32(coeffs[3]); + v_c4 = vdupq_n_f32(coeffs[4]); + v_delta = vdupq_n_f32(ColorChannel::half()); + } + + void operator()(const float * src, float * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + const float delta = ColorChannel::half(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + n *= 3; + + if (scn == 3) + for ( ; i <= n - 12; i += 12, src += 12) + { + float32x4x3_t v_src = vld3q_f32(src), v_dst; + v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2); + v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3); + v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4); + + vst3q_f32(dst + i, v_dst); + } + else + for ( ; i <= n - 12; i += 12, src += 16) + { + float32x4x4_t v_src = vld4q_f32(src); + float32x4x3_t v_dst; + v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2); + v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3); + v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4); + + vst3q_f32(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + float Y = src[0]*C0 + src[1]*C1 + src[2]*C2; + float Cr = (src[bidx^2] - Y)*C3 + delta; + float Cb = (src[bidx] - Y)*C4 + delta; + dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb; + } + } + int srccn, blueIdx; + float coeffs[5]; + float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta; +}; + +#endif template struct RGB2YCrCb_i { @@ -887,6 +1113,224 @@ template struct RGB2YCrCb_i int coeffs[5]; }; +#if CV_NEON + +template <> +struct RGB2YCrCb_i +{ + typedef uchar channel_type; + + RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs) + : srccn(_srccn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241}; + memcpy(coeffs, _coeffs ? 
_coeffs : coeffs0, 5*sizeof(coeffs[0])); + if (blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = vdup_n_s16(coeffs[0]); + v_c1 = vdup_n_s16(coeffs[1]); + v_c2 = vdup_n_s16(coeffs[2]); + v_c3 = vdupq_n_s32(coeffs[3]); + v_c4 = vdupq_n_s32(coeffs[4]); + v_delta = vdupq_n_s32(ColorChannel::half()*(1 << yuv_shift)); + v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); + } + + void operator()(const uchar * src, uchar * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + int delta = ColorChannel::half()*(1 << yuv_shift); + n *= 3; + + for ( ; i <= n - 24; i += 24, src += scn * 8) + { + uint8x8x3_t v_dst; + int16x8x3_t v_src16; + + if (scn == 3) + { + uint8x8x3_t v_src = vld3_u8(src); + v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); + v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1])); + v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2])); + } + else + { + uint8x8x4_t v_src = vld4_u8(src); + v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); + v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1])); + v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2])); + } + + int16x4x3_t v_src0; + v_src0.val[0] = vget_low_s16(v_src16.val[0]); + v_src0.val[1] = vget_low_s16(v_src16.val[1]); + v_src0.val[2] = vget_low_s16(v_src16.val[2]); + + int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); + v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift); + int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y0), v_c3); + v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift); + int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y0), v_c4); + v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift); + + v_src0.val[0] = vget_high_s16(v_src16.val[0]); + v_src0.val[1] = vget_high_s16(v_src16.val[1]); + v_src0.val[2] = vget_high_s16(v_src16.val[2]); + + int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); + v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift); + int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y1), v_c3); + v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift); + int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y1), v_c4); + v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift); + + v_dst.val[0] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1))); + v_dst.val[1] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cr0), vqmovn_s32(v_Cr1))); + v_dst.val[2] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cb0), vqmovn_s32(v_Cb1))); + + vst3_u8(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); + dst[i] = saturate_cast(Y); + dst[i+1] = saturate_cast(Cr); + dst[i+2] = saturate_cast(Cb); + } + } + int srccn, blueIdx, coeffs[5]; + int16x4_t v_c0, v_c1, v_c2; + int32x4_t v_c3, v_c4, v_delta, v_delta2; +}; + +template <> +struct RGB2YCrCb_i +{ + typedef ushort channel_type; + + RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs) + : srccn(_srccn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241}; + memcpy(coeffs, _coeffs ? 
_coeffs : coeffs0, 5*sizeof(coeffs[0])); + if (blueIdx==0) + std::swap(coeffs[0], coeffs[2]); + + v_c0 = vdupq_n_s32(coeffs[0]); + v_c1 = vdupq_n_s32(coeffs[1]); + v_c2 = vdupq_n_s32(coeffs[2]); + v_c3 = vdupq_n_s32(coeffs[3]); + v_c4 = vdupq_n_s32(coeffs[4]); + v_delta = vdupq_n_s32(ColorChannel::half()*(1 << yuv_shift)); + v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); + } + + void operator()(const ushort * src, ushort * dst, int n) const + { + int scn = srccn, bidx = blueIdx, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4]; + int delta = ColorChannel::half()*(1 << yuv_shift); + n *= 3; + + for ( ; i <= n - 24; i += 24, src += scn * 8) + { + uint16x8x3_t v_src, v_dst; + int32x4x3_t v_src0; + + if (scn == 3) + v_src = vld3q_u16(src); + else + { + uint16x8x4_t v_src_ = vld4q_u16(src); + v_src.val[0] = v_src_.val[0]; + v_src.val[1] = v_src_.val[1]; + v_src.val[2] = v_src_.val[2]; + } + + v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))); + v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))); + v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2]))); + + int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); + v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift); + int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y0), v_c3); + v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift); + int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y0), v_c4); + v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift); + + v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))); + v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))); + v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2]))); + + int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); + v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift); + int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y1), v_c3); + v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift); + int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y1), v_c4); + v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift); + + v_dst.val[0] = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1)); + v_dst.val[1] = vcombine_u16(vqmovun_s32(v_Cr0), vqmovun_s32(v_Cr1)); + v_dst.val[2] = vcombine_u16(vqmovun_s32(v_Cb0), vqmovun_s32(v_Cb1)); + + vst3q_u16(dst + i, v_dst); + } + + for ( ; i <= n - 12; i += 12, src += scn * 4) + { + uint16x4x3_t v_dst; + int32x4x3_t v_src0; + + if (scn == 3) + { + uint16x4x3_t v_src = vld3_u16(src); + v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])); + v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])); + v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2])); + } + else + { + uint16x4x4_t v_src = vld4_u16(src); + v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])); + v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])); + v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2])); + } + + int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2); + v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta2), yuv_shift); + int32x4_t v_Cr = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y), v_c3); + v_Cr = vshrq_n_s32(vaddq_s32(v_Cr, v_delta2), yuv_shift); + int32x4_t v_Cb = 
vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y), v_c4); + v_Cb = vshrq_n_s32(vaddq_s32(v_Cb, v_delta2), yuv_shift); + + v_dst.val[0] = vqmovun_s32(v_Y); + v_dst.val[1] = vqmovun_s32(v_Cr); + v_dst.val[2] = vqmovun_s32(v_Cb); + + vst3_u16(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift); + int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift); + int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift); + dst[i] = saturate_cast(Y); + dst[i+1] = saturate_cast(Cr); + dst[i+2] = saturate_cast(Cb); + } + } + int srccn, blueIdx, coeffs[5]; + int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2; +}; + +#endif template struct YCrCb2RGB_f { @@ -923,6 +1367,80 @@ template struct YCrCb2RGB_f float coeffs[4]; }; +#if CV_NEON + +template <> +struct YCrCb2RGB_f +{ + typedef float channel_type; + + YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f}; + memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0])); + + v_c0 = vdupq_n_f32(coeffs[0]); + v_c1 = vdupq_n_f32(coeffs[1]); + v_c2 = vdupq_n_f32(coeffs[2]); + v_c3 = vdupq_n_f32(coeffs[3]); + v_delta = vdupq_n_f32(ColorChannel::half()); + v_alpha = vdupq_n_f32(ColorChannel::max()); + } + + void operator()(const float* src, float* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, i = 0; + const float delta = ColorChannel::half(), alpha = ColorChannel::max(); + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + n *= 3; + + if (dcn == 3) + for ( ; i <= n - 12; i += 12, dst += 12) + { + float32x4x3_t v_src = vld3q_f32(src + i), v_dst; + float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2]; + + v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3); + v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y); + v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0); + + vst3q_f32(dst, v_dst); + } + else + for ( ; i <= n - 12; i += 12, dst += 16) + { + float32x4x3_t v_src = vld3q_f32(src + i); + float32x4x4_t v_dst; + float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2]; + + v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3); + v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y); + v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0); + v_dst.val[3] = v_alpha; + + vst4q_f32(dst, v_dst); + } + + for ( ; i < n; i += 3, dst += dcn) + { + float Y = src[i], Cr = src[i+1], Cb = src[i+2]; + + float b = Y + (Cb - delta)*C3; + float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1; + float r = Y + (Cr - delta)*C0; + + dst[bidx] = b; dst[1] = g; dst[bidx^2] = r; + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + float coeffs[4]; + float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta; +}; + +#endif template struct YCrCb2RGB_i { @@ -962,6 +1480,254 @@ template struct YCrCb2RGB_i int coeffs[4]; }; +#if CV_NEON + +template <> +struct YCrCb2RGB_i +{ + typedef uchar channel_type; + + YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {22987, -11698, -5636, 29049}; + memcpy(coeffs, _coeffs ? 
_coeffs : coeffs0, 4*sizeof(coeffs[0])); + + v_c0 = vdupq_n_s32(coeffs[0]); + v_c1 = vdupq_n_s32(coeffs[1]); + v_c2 = vdupq_n_s32(coeffs[2]); + v_c3 = vdupq_n_s32(coeffs[3]); + v_delta = vdup_n_s16(ColorChannel::half()); + v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); + v_alpha = vdup_n_u8(ColorChannel::max()); + } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, i = 0; + const uchar delta = ColorChannel::half(), alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + n *= 3; + + for ( ; i <= n - 24; i += 24, dst += dcn * 8) + { + uint8x8x3_t v_src = vld3_u8(src + i); + int16x8x3_t v_src16; + v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); + v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1])); + v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2])); + + int16x4_t v_Y = vget_low_s16(v_src16.val[0]), + v_Cr = vget_low_s16(v_src16.val[1]), + v_Cb = vget_low_s16(v_src16.val[2]); + + int32x4_t v_b0 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta)); + v_b0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y); + int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2); + v_g0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y); + int32x4_t v_r0 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta)); + v_r0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y); + + v_Y = vget_high_s16(v_src16.val[0]); + v_Cr = vget_high_s16(v_src16.val[1]); + v_Cb = vget_high_s16(v_src16.val[2]); + + int32x4_t v_b1 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta)); + v_b1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y); + int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2); + v_g1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y); + int32x4_t v_r1 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta)); + v_r1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y); + + uint8x8_t v_b = vqmovun_s16(vcombine_s16(vmovn_s32(v_b0), vmovn_s32(v_b1))); + uint8x8_t v_g = vqmovun_s16(vcombine_s16(vmovn_s32(v_g0), vmovn_s32(v_g1))); + uint8x8_t v_r = vqmovun_s16(vcombine_s16(vmovn_s32(v_r0), vmovn_s32(v_r1))); + + if (dcn == 3) + { + uint8x8x3_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + vst3_u8(dst, v_dst); + } + else + { + uint8x8x4_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + v_dst.val[3] = v_alpha; + vst4_u8(dst, v_dst); + } + } + + for ( ; i < n; i += 3, dst += dcn) + { + uchar Y = src[i]; + uchar Cr = src[i+1]; + uchar Cb = src[i+2]; + + int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift); + int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift); + int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift); + + dst[bidx] = saturate_cast(b); + dst[1] = saturate_cast(g); + dst[bidx^2] = saturate_cast(r); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + int coeffs[4]; + + int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2; + int16x4_t v_delta; + uint8x8_t v_alpha; +}; + +template <> +struct YCrCb2RGB_i +{ + typedef ushort channel_type; + + YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const int coeffs0[] = {22987, -11698, -5636, 29049}; + memcpy(coeffs, _coeffs ? 
_coeffs : coeffs0, 4*sizeof(coeffs[0])); + + v_c0 = vdupq_n_s32(coeffs[0]); + v_c1 = vdupq_n_s32(coeffs[1]); + v_c2 = vdupq_n_s32(coeffs[2]); + v_c3 = vdupq_n_s32(coeffs[3]); + v_delta = vdupq_n_s32(ColorChannel::half()); + v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); + v_alpha = vdupq_n_u16(ColorChannel::max()); + v_alpha2 = vget_low_u16(v_alpha); + } + + void operator()(const ushort* src, ushort* dst, int n) const + { + int dcn = dstcn, bidx = blueIdx, i = 0; + const ushort delta = ColorChannel::half(), alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3]; + n *= 3; + + for ( ; i <= n - 24; i += 24, dst += dcn * 8) + { + uint16x8x3_t v_src = vld3q_u16(src + i); + + int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))), + v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))), + v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2]))); + + int32x4_t v_b0 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta)); + v_b0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y); + int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2); + v_g0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y); + int32x4_t v_r0 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta)); + v_r0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y); + + v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))), + v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))), + v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2]))); + + int32x4_t v_b1 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta)); + v_b1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y); + int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2); + v_g1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y); + int32x4_t v_r1 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta)); + v_r1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y); + + uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_b0), vqmovun_s32(v_b1)); + uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_g0), vqmovun_s32(v_g1)); + uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_r0), vqmovun_s32(v_r1)); + + if (dcn == 3) + { + uint16x8x3_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + vst3q_u16(dst, v_dst); + } + else + { + uint16x8x4_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + v_dst.val[3] = v_alpha; + vst4q_u16(dst, v_dst); + } + } + + for ( ; i <= n - 12; i += 12, dst += dcn * 4) + { + uint16x4x3_t v_src = vld3_u16(src + i); + + int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])), + v_Cr = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])), + v_Cb = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2])); + + int32x4_t v_b = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta)); + v_b = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b, v_delta2), yuv_shift), v_Y); + int32x4_t v_g = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2); + v_g = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g, v_delta2), yuv_shift), v_Y); + int32x4_t v_r = vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c0); + v_r = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r, v_delta2), yuv_shift), v_Y); + + uint16x4_t v_bd = vqmovun_s32(v_b); + uint16x4_t v_gd = vqmovun_s32(v_g); + uint16x4_t v_rd = vqmovun_s32(v_r); + + if (dcn == 3) + { + uint16x4x3_t v_dst; + v_dst.val[bidx] = v_bd; + v_dst.val[1] = v_gd; + 
v_dst.val[bidx^2] = v_rd; + vst3_u16(dst, v_dst); + } + else + { + uint16x4x4_t v_dst; + v_dst.val[bidx] = v_bd; + v_dst.val[1] = v_gd; + v_dst.val[bidx^2] = v_rd; + v_dst.val[3] = v_alpha2; + vst4_u16(dst, v_dst); + } + } + + for ( ; i < n; i += 3, dst += dcn) + { + ushort Y = src[i]; + ushort Cr = src[i+1]; + ushort Cb = src[i+2]; + + int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift); + int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift); + int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift); + + dst[bidx] = saturate_cast(b); + dst[1] = saturate_cast(g); + dst[bidx^2] = saturate_cast(r); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + int coeffs[4]; + + int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2, v_delta; + uint16x8_t v_alpha; + uint16x4_t v_alpha2; +}; + +#endif ////////////////////////////////////// RGB <-> XYZ /////////////////////////////////////// @@ -1013,6 +1779,78 @@ template struct RGB2XYZ_f float coeffs[9]; }; +#if CV_NEON + +template <> +struct RGB2XYZ_f +{ + typedef float channel_type; + + RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0])); + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[2]); + std::swap(coeffs[3], coeffs[5]); + std::swap(coeffs[6], coeffs[8]); + } + + v_c0 = vdupq_n_f32(coeffs[0]); + v_c1 = vdupq_n_f32(coeffs[1]); + v_c2 = vdupq_n_f32(coeffs[2]); + v_c3 = vdupq_n_f32(coeffs[3]); + v_c4 = vdupq_n_f32(coeffs[4]); + v_c5 = vdupq_n_f32(coeffs[5]); + v_c6 = vdupq_n_f32(coeffs[6]); + v_c7 = vdupq_n_f32(coeffs[7]); + v_c8 = vdupq_n_f32(coeffs[8]); + } + + void operator()(const float* src, float* dst, int n) const + { + int scn = srccn, i = 0; + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + + n *= 3; + + if (scn == 3) + for ( ; i <= n - 12; i += 12, src += 12) + { + float32x4x3_t v_src = vld3q_f32(src), v_dst; + v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2); + v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5); + v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8); + vst3q_f32(dst + i, v_dst); + } + else + for ( ; i <= n - 12; i += 12, src += 16) + { + float32x4x4_t v_src = vld4q_f32(src); + float32x4x3_t v_dst; + v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2); + v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5); + v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8); + vst3q_f32(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + float X = saturate_cast(src[0]*C0 + src[1]*C1 + src[2]*C2); + float Y = saturate_cast(src[0]*C3 + src[1]*C4 + src[2]*C5); + float Z = saturate_cast(src[0]*C6 + src[1]*C7 + src[2]*C8); + dst[i] = X; dst[i+1] = Y; dst[i+2] = Z; + } + } + + int srccn; + float coeffs[9]; + float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; +}; + +#endif template struct RGB2XYZ_i { @@ -1055,6 +1893,247 @@ template struct RGB2XYZ_i int coeffs[9]; }; +#if CV_NEON + +template <> +struct RGB2XYZ_i +{ + typedef uchar channel_type; + + RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + static const int coeffs0[] = + { + 1689, 
1465, 739, + 871, 2929, 296, + 79, 488, 3892 + }; + for( int i = 0; i < 9; i++ ) + coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i]; + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[2]); + std::swap(coeffs[3], coeffs[5]); + std::swap(coeffs[6], coeffs[8]); + } + + v_c0 = vdup_n_u16(coeffs[0]); + v_c1 = vdup_n_u16(coeffs[1]); + v_c2 = vdup_n_u16(coeffs[2]); + v_c3 = vdup_n_u16(coeffs[3]); + v_c4 = vdup_n_u16(coeffs[4]); + v_c5 = vdup_n_u16(coeffs[5]); + v_c6 = vdup_n_u16(coeffs[6]); + v_c7 = vdup_n_u16(coeffs[7]); + v_c8 = vdup_n_u16(coeffs[8]); + v_delta = vdupq_n_u32(1 << (xyz_shift - 1)); + } + void operator()(const uchar * src, uchar * dst, int n) const + { + int scn = srccn, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + n *= 3; + + for ( ; i <= n - 24; i += 24, src += scn * 8) + { + uint8x8x3_t v_dst; + uint16x8x3_t v_src16; + + if (scn == 3) + { + uint8x8x3_t v_src = vld3_u8(src); + v_src16.val[0] = vmovl_u8(v_src.val[0]); + v_src16.val[1] = vmovl_u8(v_src.val[1]); + v_src16.val[2] = vmovl_u8(v_src.val[2]); + } + else + { + uint8x8x4_t v_src = vld4_u8(src); + v_src16.val[0] = vmovl_u8(v_src.val[0]); + v_src16.val[1] = vmovl_u8(v_src.val[1]); + v_src16.val[2] = vmovl_u8(v_src.val[2]); + } + + uint16x4_t v_s0 = vget_low_u16(v_src16.val[0]), + v_s1 = vget_low_u16(v_src16.val[1]), + v_s2 = vget_low_u16(v_src16.val[2]); + + uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift); + v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift); + v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift); + + v_s0 = vget_high_u16(v_src16.val[0]), + v_s1 = vget_high_u16(v_src16.val[1]), + v_s2 = vget_high_u16(v_src16.val[2]); + + uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift); + v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift); + v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift); + + v_dst.val[0] = vqmovn_u16(vcombine_u16(vmovn_u32(v_X0), vmovn_u32(v_X1))); + v_dst.val[1] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Y0), vmovn_u32(v_Y1))); + v_dst.val[2] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Z0), vmovn_u32(v_Z1))); + + vst3_u8(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift); + int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift); + int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift); + dst[i] = saturate_cast(X); + dst[i+1] = saturate_cast(Y); + dst[i+2] = saturate_cast(Z); + } + } + + int srccn, coeffs[9]; + uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; + uint32x4_t v_delta; +}; + +template <> +struct RGB2XYZ_i +{ + typedef ushort channel_type; + + RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn) + { + static const int coeffs0[] = + { + 1689, 1465, 739, + 871, 2929, 296, + 79, 488, 3892 + }; + for( int i = 0; i < 9; i++ ) + coeffs[i] = _coeffs ? 
cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i]; + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[2]); + std::swap(coeffs[3], coeffs[5]); + std::swap(coeffs[6], coeffs[8]); + } + + v_c0 = vdup_n_u16(coeffs[0]); + v_c1 = vdup_n_u16(coeffs[1]); + v_c2 = vdup_n_u16(coeffs[2]); + v_c3 = vdup_n_u16(coeffs[3]); + v_c4 = vdup_n_u16(coeffs[4]); + v_c5 = vdup_n_u16(coeffs[5]); + v_c6 = vdup_n_u16(coeffs[6]); + v_c7 = vdup_n_u16(coeffs[7]); + v_c8 = vdup_n_u16(coeffs[8]); + v_delta = vdupq_n_u32(1 << (xyz_shift - 1)); + } + + void operator()(const ushort * src, ushort * dst, int n) const + { + int scn = srccn, i = 0; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + n *= 3; + + for ( ; i <= n - 24; i += 24, src += scn * 8) + { + uint16x8x3_t v_src, v_dst; + + if (scn == 3) + v_src = vld3q_u16(src); + else + { + uint16x8x4_t v_src4 = vld4q_u16(src); + v_src.val[0] = v_src4.val[0]; + v_src.val[1] = v_src4.val[1]; + v_src.val[2] = v_src4.val[2]; + } + + uint16x4_t v_s0 = vget_low_u16(v_src.val[0]), + v_s1 = vget_low_u16(v_src.val[1]), + v_s2 = vget_low_u16(v_src.val[2]); + + uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift); + v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift); + v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift); + + v_s0 = vget_high_u16(v_src.val[0]), + v_s1 = vget_high_u16(v_src.val[1]), + v_s2 = vget_high_u16(v_src.val[2]); + + uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift); + v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift); + v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift); + + v_dst.val[0] = vcombine_u16(vqmovn_u32(v_X0), vqmovn_u32(v_X1)); + v_dst.val[1] = vcombine_u16(vqmovn_u32(v_Y0), vqmovn_u32(v_Y1)); + v_dst.val[2] = vcombine_u16(vqmovn_u32(v_Z0), vqmovn_u32(v_Z1)); + + vst3q_u16(dst + i, v_dst); + } + + for ( ; i <= n - 12; i += 12, src += scn * 4) + { + uint16x4x3_t v_dst; + uint16x4_t v_s0, v_s1, v_s2; + + if (scn == 3) + { + uint16x4x3_t v_src = vld3_u16(src); + v_s0 = v_src.val[0]; + v_s1 = v_src.val[1]; + v_s2 = v_src.val[2]; + } + else + { + uint16x4x4_t v_src = vld4_u16(src); + v_s0 = v_src.val[0]; + v_s1 = v_src.val[1]; + v_s2 = v_src.val[2]; + } + + uint32x4_t v_X = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + uint32x4_t v_Y = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + uint32x4_t v_Z = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + + v_dst.val[0] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_X, v_delta), xyz_shift)); + v_dst.val[1] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Y, v_delta), xyz_shift)); + v_dst.val[2] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Z, v_delta), xyz_shift)); + + vst3_u16(dst + i, v_dst); + } + + for ( ; i < n; i += 3, src += scn) + { + int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift); + int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift); + int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + 
src[2]*C8, xyz_shift); + dst[i] = saturate_cast(X); + dst[i+1] = saturate_cast(Y); + dst[i+2] = saturate_cast(Z); + } + } + + int srccn, coeffs[9]; + uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; + uint32x4_t v_delta; +}; + +#endif template struct XYZ2RGB_f { @@ -1141,6 +2220,278 @@ template struct XYZ2RGB_i int coeffs[9]; }; +#if CV_NEON + +template <> +struct XYZ2RGB_i +{ + typedef uchar channel_type; + + XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const int coeffs0[] = + { + 13273, -6296, -2042, + -3970, 7684, 170, + 228, -836, 4331 + }; + for(int i = 0; i < 9; i++) + coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i]; + + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[6]); + std::swap(coeffs[1], coeffs[7]); + std::swap(coeffs[2], coeffs[8]); + } + + v_c0 = vdup_n_s16(coeffs[0]); + v_c1 = vdup_n_s16(coeffs[1]); + v_c2 = vdup_n_s16(coeffs[2]); + v_c3 = vdup_n_s16(coeffs[3]); + v_c4 = vdup_n_s16(coeffs[4]); + v_c5 = vdup_n_s16(coeffs[5]); + v_c6 = vdup_n_s16(coeffs[6]); + v_c7 = vdup_n_s16(coeffs[7]); + v_c8 = vdup_n_s16(coeffs[8]); + v_delta = vdupq_n_s32(1 << (xyz_shift - 1)); + v_alpha = vmovn_u16(vdupq_n_u16(ColorChannel::max())); + } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int dcn = dstcn, i = 0; + uchar alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + n *= 3; + + for ( ; i <= n - 24; i += 24, dst += dcn * 8) + { + uint8x8x3_t v_src = vld3_u8(src + i); + int16x8x3_t v_src16; + v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0])); + v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1])); + v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2])); + + int16x4_t v_s0 = vget_low_s16(v_src16.val[0]), + v_s1 = vget_low_s16(v_src16.val[1]), + v_s2 = vget_low_s16(v_src16.val[2]); + + int32x4_t v_X0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + int32x4_t v_Z0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift); + v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift); + v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift); + + v_s0 = vget_high_s16(v_src16.val[0]), + v_s1 = vget_high_s16(v_src16.val[1]), + v_s2 = vget_high_s16(v_src16.val[2]); + + int32x4_t v_X1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + int32x4_t v_Z1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift); + v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift); + v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift); + + uint8x8_t v_b = vqmovun_s16(vcombine_s16(vqmovn_s32(v_X0), vqmovn_s32(v_X1))); + uint8x8_t v_g = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1))); + uint8x8_t v_r = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Z0), vqmovn_s32(v_Z1))); + + if (dcn == 3) + { + uint8x8x3_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + vst3_u8(dst, v_dst); + } + else + { + uint8x8x4_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + v_dst.val[3] = v_alpha; + vst4_u8(dst, v_dst); + } + } + + for 
( ; i < n; i += 3, dst += dcn) + { + int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift); + int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift); + int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift); + dst[0] = saturate_cast(B); dst[1] = saturate_cast(G); + dst[2] = saturate_cast(R); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + int coeffs[9]; + + int16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8; + uint8x8_t v_alpha; + int32x4_t v_delta; +}; + +template <> +struct XYZ2RGB_i +{ + typedef ushort channel_type; + + XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs) + : dstcn(_dstcn), blueIdx(_blueIdx) + { + static const int coeffs0[] = + { + 13273, -6296, -2042, + -3970, 7684, 170, + 228, -836, 4331 + }; + for(int i = 0; i < 9; i++) + coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i]; + + if(blueIdx == 0) + { + std::swap(coeffs[0], coeffs[6]); + std::swap(coeffs[1], coeffs[7]); + std::swap(coeffs[2], coeffs[8]); + } + + v_c0 = vdupq_n_s32(coeffs[0]); + v_c1 = vdupq_n_s32(coeffs[1]); + v_c2 = vdupq_n_s32(coeffs[2]); + v_c3 = vdupq_n_s32(coeffs[3]); + v_c4 = vdupq_n_s32(coeffs[4]); + v_c5 = vdupq_n_s32(coeffs[5]); + v_c6 = vdupq_n_s32(coeffs[6]); + v_c7 = vdupq_n_s32(coeffs[7]); + v_c8 = vdupq_n_s32(coeffs[8]); + v_delta = vdupq_n_s32(1 << (xyz_shift - 1)); + v_alpha = vdupq_n_u16(ColorChannel::max()); + v_alpha2 = vget_low_u16(v_alpha); + } + + void operator()(const ushort* src, ushort* dst, int n) const + { + int dcn = dstcn, i = 0; + ushort alpha = ColorChannel::max(); + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + n *= 3; + + for ( ; i <= n - 24; i += 24, dst += dcn * 8) + { + uint16x8x3_t v_src = vld3q_u16(src + i); + int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))), + v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))), + v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2]))); + + int32x4_t v_X0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + int32x4_t v_Z0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift); + v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift); + v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift); + + v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))); + v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))); + v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2]))); + + int32x4_t v_X1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + int32x4_t v_Z1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift); + v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift); + v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift); + + uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_X0), vqmovun_s32(v_X1)); + uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1)); + uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_Z0), vqmovun_s32(v_Z1)); + + if (dcn == 3) + { + uint16x8x3_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + vst3q_u16(dst, v_dst); + } + else + 
{ + uint16x8x4_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + v_dst.val[3] = v_alpha; + vst4q_u16(dst, v_dst); + } + } + + for ( ; i <= n - 12; i += 12, dst += dcn * 4) + { + uint16x4x3_t v_src = vld3_u16(src + i); + int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])), + v_s1 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])), + v_s2 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2])); + + int32x4_t v_X = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2); + int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5); + int32x4_t v_Z = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8); + v_X = vshrq_n_s32(vaddq_s32(v_X, v_delta), xyz_shift); + v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta), xyz_shift); + v_Z = vshrq_n_s32(vaddq_s32(v_Z, v_delta), xyz_shift); + + uint16x4_t v_b = vqmovun_s32(v_X); + uint16x4_t v_g = vqmovun_s32(v_Y); + uint16x4_t v_r = vqmovun_s32(v_Z); + + if (dcn == 3) + { + uint16x4x3_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + vst3_u16(dst, v_dst); + } + else + { + uint16x4x4_t v_dst; + v_dst.val[0] = v_b; + v_dst.val[1] = v_g; + v_dst.val[2] = v_r; + v_dst.val[3] = v_alpha2; + vst4_u16(dst, v_dst); + } + } + + for ( ; i < n; i += 3, dst += dcn) + { + int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift); + int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift); + int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift); + dst[0] = saturate_cast(B); dst[1] = saturate_cast(G); + dst[2] = saturate_cast(R); + if( dcn == 4 ) + dst[3] = alpha; + } + } + int dstcn, blueIdx; + int coeffs[9]; + + int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8, v_delta; + uint16x4_t v_alpha2; + uint16x8_t v_alpha; +}; + +#endif ////////////////////////////////////// RGB <-> HSV /////////////////////////////////////// diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 861b7ae3a..f4c2cf287 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -3580,14 +3580,6 @@ public: _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1); } } - #elif CV_NEON - for( ; x1 <= bcols - 4; x1 += 4 ) - { - int32x4_t v_sx = cv_vrndq_s32_f32(vld1q_f32(sX + x1)), - v_sy = cv_vrndq_s32_f32(vld1q_f32(sY + x1)); - int16x4x2_t v_dst = vzip_s16(vqmovn_s32(v_sx), vqmovn_s32(v_sy)); - vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1])); - } #endif for( ; x1 < bcols; x1++ ) From 28ee84b7310f290927ef070be954ebdde10b359a Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 6 Oct 2014 11:18:33 -0700 Subject: [PATCH 24/40] cv::cvtColor (BGR[A]2RGB[A]) --- modules/imgproc/src/color.cpp | 137 ++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 27da7953b..2faec5daf 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -580,6 +580,143 @@ template struct RGB2RGB int srccn, dstcn, blueIdx; }; +#if CV_NEON + +template<> struct RGB2RGB +{ + typedef uchar channel_type; + + RGB2RGB(int _srccn, int _dstcn, int _blueIdx) : + srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx) + { + v_alpha = vdupq_n_u8(ColorChannel::max()); + v_alpha2 = vget_low_u8(v_alpha); + } + + void operator()(const uchar * src, uchar * dst, int n) const + { + int scn = srccn, dcn = dstcn, bidx = blueIdx, i = 0; + if (dcn == 3) + { + n *= 3; + if (scn == 3) + { + for ( ; i <= n - 48; i += 48, src 
+= 48 ) + { + uint8x16x3_t v_src = vld3q_u8(src), v_dst; + v_dst.val[0] = v_src.val[bidx]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[2] = v_src.val[bidx ^ 2]; + vst3q_u8(dst + i, v_dst); + } + for ( ; i <= n - 24; i += 24, src += 24 ) + { + uint8x8x3_t v_src = vld3_u8(src), v_dst; + v_dst.val[0] = v_src.val[bidx]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[2] = v_src.val[bidx ^ 2]; + vst3_u8(dst + i, v_dst); + } + for ( ; i < n; i += 3, src += 3 ) + { + uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2]; + dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2; + } + } + else + { + for ( ; i <= n - 48; i += 48, src += 64 ) + { + uint8x16x4_t v_src = vld4q_u8(src); + uint8x16x3_t v_dst; + v_dst.val[0] = v_src.val[bidx]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[2] = v_src.val[bidx ^ 2]; + vst3q_u8(dst + i, v_dst); + } + for ( ; i <= n - 24; i += 24, src += 32 ) + { + uint8x8x4_t v_src = vld4_u8(src); + uint8x8x3_t v_dst; + v_dst.val[0] = v_src.val[bidx]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[2] = v_src.val[bidx ^ 2]; + vst3_u8(dst + i, v_dst); + } + for ( ; i < n; i += 3, src += 4 ) + { + uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2]; + dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2; + } + } + } + else if (scn == 3) + { + n *= 3; + for ( ; i <= n - 48; i += 48, dst += 64 ) + { + uint8x16x3_t v_src = vld3q_u8(src + i); + uint8x16x4_t v_dst; + v_dst.val[bidx] = v_src.val[0]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[bidx ^ 2] = v_src.val[2]; + v_dst.val[3] = v_alpha; + vst4q_u8(dst, v_dst); + } + for ( ; i <= n - 24; i += 24, dst += 32 ) + { + uint8x8x3_t v_src = vld3_u8(src + i); + uint8x8x4_t v_dst; + v_dst.val[bidx] = v_src.val[0]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[bidx ^ 2] = v_src.val[2]; + v_dst.val[3] = v_alpha2; + vst4_u8(dst, v_dst); + } + uchar alpha = ColorChannel::max(); + for (; i < n; i += 3, dst += 4 ) + { + uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2]; + dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha; + } + } + else + { + n *= 4; + for ( ; i <= n - 64; i += 64 ) + { + uint8x16x4_t v_src = vld4q_u8(src + i), v_dst; + v_dst.val[0] = v_src.val[0]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[2] = v_src.val[2]; + v_dst.val[3] = v_src.val[3]; + vst4q_u8(dst + i, v_dst); + } + for ( ; i <= n - 32; i += 32 ) + { + uint8x8x4_t v_src = vld4_u8(src + i), v_dst; + v_dst.val[0] = v_src.val[0]; + v_dst.val[1] = v_src.val[1]; + v_dst.val[2] = v_src.val[2]; + v_dst.val[3] = v_src.val[3]; + vst4_u8(dst + i, v_dst); + } + for ( ; i < n; i += 4) + { + uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3]; + dst[i] = t2; dst[i+1] = t1; dst[i+2] = t0; dst[i+3] = t3; + } + } + } + + int srccn, dstcn, blueIdx; + + uint8x16_t v_alpha; + uint8x8_t v_alpha2; +}; + +#endif + /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB ////////// struct RGB5x52RGB From b9f57bda0518d5a2a1fef9a42967999616be7f18 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 6 Oct 2014 11:55:12 -0700 Subject: [PATCH 25/40] cv::cvtColor (RGB5x52RGB) --- modules/imgproc/src/color.cpp | 82 +++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 4 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 2faec5daf..5db6d1ed2 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -724,13 +724,51 @@ struct RGB5x52RGB typedef uchar channel_type; RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits) - : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits) {} + : dstcn(_dstcn), 
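The 3-channel case above swaps B and R purely by renaming the de-interleaved registers that vld3q_u8 produces before vst3q_u8 re-interleaves them. A self-contained sketch of that idea, with a scalar tail for leftover pixels (the function name and the __ARM_NEON guard are mine, not from the patch):

    #include <cstddef>
    #include <cstdint>
    #if defined(__ARM_NEON) || defined(__ARM_NEON__)
    #include <arm_neon.h>
    #endif

    // Swap channels 0 and 2 of packed 3-channel 8-bit pixels (BGR <-> RGB).
    void swapRB(const uint8_t* src, uint8_t* dst, size_t npixels)
    {
        size_t i = 0;
    #if defined(__ARM_NEON) || defined(__ARM_NEON__)
        for ( ; i + 16 <= npixels; i += 16, src += 48, dst += 48)
        {
            uint8x16x3_t v = vld3q_u8(src);   // de-interleave 16 pixels
            uint8x16x3_t o;
            o.val[0] = v.val[2];              // channel 0 <- channel 2
            o.val[1] = v.val[1];              // middle channel unchanged
            o.val[2] = v.val[0];              // channel 2 <- channel 0
            vst3q_u8(dst, o);                 // re-interleave and store
        }
    #endif
        for ( ; i < npixels; i++, src += 3, dst += 3)
        {
            dst[0] = src[2]; dst[1] = src[1]; dst[2] = src[0];
        }
    }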
blueIdx(_blueIdx), greenBits(_greenBits) + { + #if CV_NEON + v_n3 = vdupq_n_u16(~3); + v_n7 = vdupq_n_u16(~7); + v_255 = vdupq_n_u8(255); + v_0 = vdupq_n_u8(0); + v_mask = vdupq_n_u16(0x8000); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { - int dcn = dstcn, bidx = blueIdx; + int dcn = dstcn, bidx = blueIdx, i = 0; if( greenBits == 6 ) - for( int i = 0; i < n; i++, dst += dcn ) + { + #if CV_NEON + for ( ; i <= n - 16; i += 16, dst += dcn * 16) + { + uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8); + uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3))); + uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 3), v_n3)), + vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 3), v_n3))); + uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 8), v_n7)), + vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 8), v_n7))); + if (dcn == 3) + { + uint8x16x3_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + vst3q_u8(dst, v_dst); + } + else + { + uint8x16x4_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + v_dst.val[3] = v_255; + vst4q_u8(dst, v_dst); + } + } + #endif + for( ; i < n; i++, dst += dcn ) { unsigned t = ((const ushort*)src)[i]; dst[bidx] = (uchar)(t << 3); @@ -739,8 +777,39 @@ struct RGB5x52RGB if( dcn == 4 ) dst[3] = 255; } + } else - for( int i = 0; i < n; i++, dst += dcn ) + { + #if CV_NEON + for ( ; i <= n - 16; i += 16, dst += dcn * 16) + { + uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8); + uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3))); + uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 2), v_n7)), + vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 2), v_n7))); + uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 7), v_n7)), + vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 7), v_n7))); + if (dcn == 3) + { + uint8x16x3_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + vst3q_u8(dst, v_dst); + } + else + { + uint8x16x4_t v_dst; + v_dst.val[bidx] = v_b; + v_dst.val[1] = v_g; + v_dst.val[bidx^2] = v_r; + v_dst.val[3] = vbslq_u8(vcombine_u8(vqmovn_u16(vandq_u16(v_src0, v_mask)), + vqmovn_u16(vandq_u16(v_src1, v_mask))), v_255, v_0); + vst4q_u8(dst, v_dst); + } + } + #endif + for( ; i < n; i++, dst += dcn ) { unsigned t = ((const ushort*)src)[i]; dst[bidx] = (uchar)(t << 3); @@ -749,9 +818,14 @@ struct RGB5x52RGB if( dcn == 4 ) dst[3] = t & 0x8000 ? 
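The shifts and the ~3 / ~7 masks above recover full 8-bit channels from a packed 16-bit pixel, leaving the low bits at zero. A scalar one-pixel version of the 5-6-5 case, assuming the same bit layout the scalar branch implies (which of the outer channels is blue vs. red depends on blueIdx in the real code):

    #include <cstdint>

    // Unpack one 5-6-5 pixel: bits [4:0], [10:5], [15:11] become three 8-bit
    // channels with their low bits zeroed; the NEON branch does the same shifts
    // on whole vectors with vshlq_n_u16 / vshrq_n_u16 plus the ~3 / ~7 masks.
    static inline void unpack565(uint16_t t, uint8_t& c0, uint8_t& c1, uint8_t& c2)
    {
        c0 = (uint8_t)(t << 3);          // xxxxx000
        c1 = (uint8_t)((t >> 3) & ~3);   // xxxxxx00
        c2 = (uint8_t)((t >> 8) & ~7);   // xxxxx000
    }

    int main()
    {
        uint8_t c0, c1, c2;
        unpack565(0xFFFF, c0, c1, c2);   // all bits set
        return (c0 == 0xF8 && c1 == 0xFC && c2 == 0xF8) ? 0 : 1;
    }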
255 : 0; } + } } int dstcn, blueIdx, greenBits; + #if CV_NEON + uint16x8_t v_n3, v_n7, v_mask; + uint8x16_t v_255, v_0; + #endif }; From a84c4ac197ebc0b9fc927286d52f56d7d3f9a6f7 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 6 Oct 2014 12:40:41 -0700 Subject: [PATCH 26/40] cv::cvtColor (RGBRGB5x5) --- modules/imgproc/src/color.cpp | 82 ++++++++++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 10 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 5db6d1ed2..c761d1ae4 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -834,30 +834,92 @@ struct RGB2RGB5x5 typedef uchar channel_type; RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits) - : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits) {} + : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits) + { + #if CV_NEON + v_n3 = vdup_n_u8(~3); + v_n7 = vdup_n_u8(~7); + v_mask = vdupq_n_u16(0x8000); + v_0 = vdupq_n_u16(0); + v_full = vdupq_n_u16(0xffff); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { - int scn = srccn, bidx = blueIdx; - if( greenBits == 6 ) - for( int i = 0; i < n; i++, src += scn ) + int scn = srccn, bidx = blueIdx, i = 0; + if (greenBits == 6) + { + if (scn == 3) { - ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8)); + #if CV_NEON + for ( ; i <= n - 8; i += 8, src += 24 ) + { + uint8x8x3_t v_src = vld3_u8(src); + uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8)); + vst1q_u16((ushort *)dst + i, v_dst); + } + #endif + for ( ; i < n; i++, src += 3 ) + ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8)); } - else if( scn == 3 ) - for( int i = 0; i < n; i++, src += 3 ) + else { + #if CV_NEON + for ( ; i <= n - 8; i += 8, src += 32 ) + { + uint8x8x4_t v_src = vld4_u8(src); + uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8)); + vst1q_u16((ushort *)dst + i, v_dst); + } + #endif + for ( ; i < n; i++, src += 4 ) + ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8)); + } + } + else if (scn == 3) + { + #if CV_NEON + for ( ; i <= n - 8; i += 8, src += 24 ) + { + uint8x8x3_t v_src = vld3_u8(src); + uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7)); + vst1q_u16((ushort *)dst + i, v_dst); + } + #endif + for ( ; i < n; i++, src += 3 ) ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|((src[bidx^2]&~7) << 7)); - } + } else - for( int i = 0; i < n; i++, src += 4 ) + { + #if CV_NEON + for ( ; i <= n - 8; i += 8, src += 32 ) { + uint8x8x4_t v_src = vld4_u8(src); + uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2)); + v_dst = vorrq_u16(v_dst, vorrq_u16(vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7), + vbslq_u16(veorq_u16(vceqq_u16(vmovl_u8(v_src.val[3]), v_0), v_full), v_mask, v_0))); + vst1q_u16((ushort *)dst + i, v_dst); + } + #endif + for ( ; i < n; i++, src += 
4 ) ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)| ((src[bidx^2]&~7) << 7)|(src[3] ? 0x8000 : 0)); - } + } } int srccn, blueIdx, greenBits; + #if CV_NEON + uint8x8_t v_n3, v_n7; + uint16x8_t v_mask, v_0, v_full; + #endif }; ///////////////////////////////// Color to/from Grayscale //////////////////////////////// From f91e461ea097f31bff53be21be7306bae04615b2 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 6 Oct 2014 13:06:08 -0700 Subject: [PATCH 27/40] cv::cvtColor (RGB5x52Gray) --- modules/imgproc/src/color.cpp | 64 +++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index c761d1ae4..66547c8fb 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -995,27 +995,85 @@ struct RGB5x52Gray { typedef uchar channel_type; - RGB5x52Gray(int _greenBits) : greenBits(_greenBits) {} + RGB5x52Gray(int _greenBits) : greenBits(_greenBits) + { + #if CV_NEON + v_b2y = vdup_n_u16(B2Y); + v_g2y = vdup_n_u16(G2Y); + v_r2y = vdup_n_u16(R2Y); + v_delta = vdupq_n_u32(1 << (yuv_shift - 1)); + v_f8 = vdupq_n_u16(0xf8); + v_fc = vdupq_n_u16(0xfc); + #endif + } + void operator()(const uchar* src, uchar* dst, int n) const { + int i = 0; if( greenBits == 6 ) - for( int i = 0; i < n; i++ ) + { + #if CV_NEON + for ( ; i <= n - 8; i += 8) + { + uint16x8_t v_src = vld1q_u16((ushort *)src + i); + uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8), + v_t1 = vandq_u16(vshrq_n_u16(v_src, 3), v_fc), + v_t2 = vandq_u16(vshrq_n_u16(v_src, 8), v_f8); + + uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y), + vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y); + uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y), + vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y); + v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift); + v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift); + + vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)))); + } + #endif + for ( ; i < n; i++) { int t = ((ushort*)src)[i]; dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y + ((t >> 3) & 0xfc)*G2Y + ((t >> 8) & 0xf8)*R2Y, yuv_shift); } + } else - for( int i = 0; i < n; i++ ) + { + #if CV_NEON + for ( ; i <= n - 8; i += 8) + { + uint16x8_t v_src = vld1q_u16((ushort *)src + i); + uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8), + v_t1 = vandq_u16(vshrq_n_u16(v_src, 2), v_f8), + v_t2 = vandq_u16(vshrq_n_u16(v_src, 8), v_f8); + + uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y), + vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y); + uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y), + vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y); + v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift); + v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift); + + vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)))); + } + #endif + for ( ; i < n; i++) { int t = ((ushort*)src)[i]; dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y + ((t >> 2) & 0xf8)*G2Y + ((t >> 7) & 0xf8)*R2Y, yuv_shift); } + } } int greenBits; + + #if CV_NEON + uint16x4_t v_b2y, v_g2y, v_r2y; + uint32x4_t v_delta; + uint16x8_t v_f8, v_fc; + #endif }; From eb3046f7d3f912b381c40d0854485660f1c437c5 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 6 Oct 2014 13:33:06 -0700 Subject: [PATCH 28/40] cv::cvtColor (Gray2RGB5x5) --- 
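The RGB5x52Gray path above expands the packed channels and then takes the usual fixed-point luma sum, vectorized with vmull_u16 / vmlal_u16. A scalar reference of that sum, using the Q14 weights OpenCV derives from 0.114 / 0.587 / 0.299 (yuv_shift == 14); the helper name and the checks in main are illustrative:

    #include <cstdint>

    // y = (B*B2Y + G*G2Y + R*R2Y + 2^(yuv_shift-1)) >> yuv_shift
    enum { yuv_shift = 14, B2Y = 1868, G2Y = 9617, R2Y = 4899 };  // sums to 1<<14

    static inline uint8_t bgr2gray(uint8_t b, uint8_t g, uint8_t r)
    {
        return (uint8_t)((b * B2Y + g * G2Y + r * R2Y + (1 << (yuv_shift - 1))) >> yuv_shift);
    }

    int main()
    {
        // White must map to 255 and black to 0.
        return (bgr2gray(255, 255, 255) == 255 && bgr2gray(0, 0, 0) == 0) ? 0 : 1;
    }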
modules/imgproc/src/color.cpp | 50 +++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 66547c8fb..3d5cdedb7 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -686,18 +686,18 @@ template<> struct RGB2RGB for ( ; i <= n - 64; i += 64 ) { uint8x16x4_t v_src = vld4q_u8(src + i), v_dst; - v_dst.val[0] = v_src.val[0]; + v_dst.val[0] = v_src.val[2]; v_dst.val[1] = v_src.val[1]; - v_dst.val[2] = v_src.val[2]; + v_dst.val[2] = v_src.val[0]; v_dst.val[3] = v_src.val[3]; vst4q_u8(dst + i, v_dst); } for ( ; i <= n - 32; i += 32 ) { uint8x8x4_t v_src = vld4_u8(src + i), v_dst; - v_dst.val[0] = v_src.val[0]; + v_dst.val[0] = v_src.val[2]; v_dst.val[1] = v_src.val[1]; - v_dst.val[2] = v_src.val[2]; + v_dst.val[2] = v_src.val[0]; v_dst.val[3] = v_src.val[3]; vst4_u8(dst + i, v_dst); } @@ -956,23 +956,57 @@ struct Gray2RGB5x5 { typedef uchar channel_type; - Gray2RGB5x5(int _greenBits) : greenBits(_greenBits) {} + Gray2RGB5x5(int _greenBits) : greenBits(_greenBits) + { + #if CV_NEON + v_n7 = vdup_n_u8(~7); + v_n3 = vdup_n_u8(~3); + #endif + } + void operator()(const uchar* src, uchar* dst, int n) const { + int i = 0; if( greenBits == 6 ) - for( int i = 0; i < n; i++ ) + { + #if CV_NEON + for ( ; i <= n - 8; i += 8 ) + { + uint8x8_t v_src = vld1_u8(src + i); + uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src, 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n3)), 3)); + v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n7)), 8)); + vst1q_u16((ushort *)dst + i, v_dst); + } + #endif + for ( ; i < n; i++ ) { int t = src[i]; ((ushort*)dst)[i] = (ushort)((t >> 3)|((t & ~3) << 3)|((t & ~7) << 8)); } + } else - for( int i = 0; i < n; i++ ) + { + #if CV_NEON + for ( ; i <= n - 8; i += 8 ) + { + uint16x8_t v_src = vmovl_u8(vshr_n_u8(vld1_u8(src + i), 3)); + uint16x8_t v_dst = vorrq_u16(vorrq_u16(v_src, vshlq_n_u16(v_src, 5)), vshlq_n_u16(v_src, 10)); + vst1q_u16((ushort *)dst + i, v_dst); + } + #endif + for( ; i < n; i++ ) { int t = src[i] >> 3; ((ushort*)dst)[i] = (ushort)(t|(t << 5)|(t << 10)); } + } } int greenBits; + + #if CV_NEON + uint8x8_t v_n7, v_n3; + #endif }; @@ -1046,7 +1080,7 @@ struct RGB5x52Gray uint16x8_t v_src = vld1q_u16((ushort *)src + i); uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8), v_t1 = vandq_u16(vshrq_n_u16(v_src, 2), v_f8), - v_t2 = vandq_u16(vshrq_n_u16(v_src, 8), v_f8); + v_t2 = vandq_u16(vshrq_n_u16(v_src, 7), v_f8); uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y), vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y); From 8d367d4b4d45fd75390abeda57a7e93ee53c127d Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 7 Oct 2014 15:25:02 +0000 Subject: [PATCH 29/40] cv::cvtColor (HSV2RGB CV_8U) --- modules/imgproc/src/color.cpp | 70 +++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 3 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 3d5cdedb7..6b2f9ffc0 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -3047,7 +3047,13 @@ struct HSV2RGB_b HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange) : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange) - {} + { + #if CV_NEON + v_scale_inv = vdupq_n_f32(1.f/255.f); + v_scale = vdupq_n_f32(255.f); + v_alpha = vdup_n_u8(ColorChannel::max()); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { @@ -3058,8 +3064,30 @@ struct 
HSV2RGB_b for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { int dn = std::min(n - i, (int)BLOCK_SIZE); + j = 0; - for( j = 0; j < dn*3; j += 3 ) + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24) + { + uint8x8x3_t v_src = vld3_u8(src + j); + uint16x8_t v_t0 = vmovl_u8(v_src.val[0]), + v_t1 = vmovl_u8(v_src.val[1]), + v_t2 = vmovl_u8(v_src.val[2]); + + float32x4x3_t v_dst; + v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j, v_dst); + + v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j + 12, v_dst); + } + #endif + + for( ; j < dn*3; j += 3 ) { buf[j] = src[j]; buf[j+1] = src[j+1]*(1.f/255.f); @@ -3067,7 +3095,39 @@ struct HSV2RGB_b } cvt(buf, buf, dn); - for( j = 0; j < dn*3; j += 3, dst += dcn ) + j = 0; + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8) + { + float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); + uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale))))); + uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); + uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); + + if (dcn == 4) + { + uint8x8x4_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + v_dst.val[3] = v_alpha; + vst4_u8(dst, v_dst); + } + else + { + uint8x8x3_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + vst3_u8(dst, v_dst); + } + } + #endif + + for( ; j < dn*3; j += 3, dst += dcn ) { dst[0] = saturate_cast(buf[j]*255.f); dst[1] = saturate_cast(buf[j+1]*255.f); @@ -3080,6 +3140,10 @@ struct HSV2RGB_b int dstcn; HSV2RGB_f cvt; + #if CV_NEON + float32x4_t v_scale, v_scale_inv; + uint8x8_t v_alpha; + #endif }; From c173d9fa183cc8c5687d84683ab5189cc22f580f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 7 Oct 2014 15:33:09 +0000 Subject: [PATCH 30/40] cv::cvtColor (RGB2HLS CV_8U) --- modules/imgproc/src/color.cpp | 66 +++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 6b2f9ffc0..975617fff 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -3208,7 +3208,14 @@ struct RGB2HLS_b typedef uchar channel_type; RGB2HLS_b(int _srccn, int _blueIdx, int _hrange) - : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange) {} + : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange) + { + #if CV_NEON + v_scale_inv = vdupq_n_f32(1.f/255.f); + v_scale = vdupq_n_f32(255.f); + v_alpha = vdup_n_u8(ColorChannel::max()); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { @@ -3218,8 +3225,41 @@ struct RGB2HLS_b for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 ) { int dn = std::min(n - i, (int)BLOCK_SIZE); + j = 0; - for( j = 0; j < dn*3; j += 3, src += scn ) + #if CV_NEON + for ( ; j 
<= (dn - 8) * 3; j += 24, src += 8 * scn) + { + uint16x8_t v_t0, v_t1, v_t2; + + if (scn == 3) + { + uint8x8x3_t v_src = vld3_u8(src); + v_t0 = vmovl_u8(v_src.val[0]); + v_t1 = vmovl_u8(v_src.val[1]); + v_t2 = vmovl_u8(v_src.val[2]); + } + else + { + uint8x8x4_t v_src = vld4_u8(src); + v_t0 = vmovl_u8(v_src.val[0]); + v_t1 = vmovl_u8(v_src.val[1]); + v_t2 = vmovl_u8(v_src.val[2]); + } + + float32x4x3_t v_dst; + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j, v_dst); + + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j + 12, v_dst); + } + #endif + for( ; j < dn*3; j += 3, src += scn ) { buf[j] = src[0]*(1.f/255.f); buf[j+1] = src[1]*(1.f/255.f); @@ -3227,7 +3267,23 @@ struct RGB2HLS_b } cvt(buf, buf, dn); - for( j = 0; j < dn*3; j += 3 ) + j = 0; + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24) + { + float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); + + uint8x8x3_t v_dst; + v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_src0.val[0])), + vqmovn_u32(cv_vrndq_u32_f32(v_src1.val[0])))); + v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); + v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); + vst3_u8(dst + j, v_dst); + } + #endif + for( ; j < dn*3; j += 3 ) { dst[j] = saturate_cast(buf[j]); dst[j+1] = saturate_cast(buf[j+1]*255.f); @@ -3238,6 +3294,10 @@ struct RGB2HLS_b int srccn; RGB2HLS_f cvt; + #if CV_NEON + float32x4_t v_scale, v_scale_inv; + uint8x8_t v_alpha; + #endif }; From 71c3e67cb2a0c615955bc77a30c4cc1b38e22dcd Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 7 Oct 2014 15:37:21 +0000 Subject: [PATCH 31/40] cv::cvtColor (HLS2RGB CV_8U) --- modules/imgproc/src/color.cpp | 68 +++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 975617fff..67dcfa67b 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -3371,7 +3371,13 @@ struct HLS2RGB_b HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange) : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange) - {} + { + #if CV_NEON + v_scale_inv = vdupq_n_f32(1.f/255.f); + v_scale = vdupq_n_f32(255.f); + v_alpha = vdup_n_u8(ColorChannel::max()); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { @@ -3382,8 +3388,29 @@ struct HLS2RGB_b for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { int dn = std::min(n - i, (int)BLOCK_SIZE); + j = 0; - for( j = 0; j < dn*3; j += 3 ) + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24) + { + uint8x8x3_t v_src = vld3_u8(src + j); + uint16x8_t v_t0 = vmovl_u8(v_src.val[0]), + v_t1 = vmovl_u8(v_src.val[1]), + v_t2 = vmovl_u8(v_src.val[2]); + + float32x4x3_t v_dst; + v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), 
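The 8-bit HSV/HLS converters here (and the Lab/Luv ones further down) all share one structure: expand a block of pixels to float, run the existing float converter in place on a small scratch buffer, then quantize back; the NEON code only accelerates the two conversion loops around that call. A generic sketch of the pattern (BLOCK_SIZE, the uniform 1/255 scaling, and the pass-through cvt functor are illustrative; the real converters scale H, L, u and v differently and clamp via saturate_cast):

    #include <algorithm>
    #include <cstdint>

    template <typename FloatCvt>
    void convert8uViaFloat(const uint8_t* src, uint8_t* dst, int n, FloatCvt cvt)
    {
        const int BLOCK_SIZE = 128;        // pixels per chunk, 3 channels each
        float buf[BLOCK_SIZE * 3];

        for (int i = 0; i < n; i += BLOCK_SIZE)
        {
            int dn = std::min(n - i, BLOCK_SIZE);
            for (int j = 0; j < dn * 3; j++)
                buf[j] = src[i * 3 + j] * (1.f / 255.f);  // u8 -> float
            cvt(buf, buf, dn);                            // float-only color math
            for (int j = 0; j < dn * 3; j++)              // assumes cvt keeps values in [0,1]
                dst[i * 3 + j] = (uint8_t)(buf[j] * 255.f + 0.5f);
        }
    }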
v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j, v_dst); + + v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j + 12, v_dst); + } + #endif + for( ; j < dn*3; j += 3 ) { buf[j] = src[j]; buf[j+1] = src[j+1]*(1.f/255.f); @@ -3391,7 +3418,38 @@ struct HLS2RGB_b } cvt(buf, buf, dn); - for( j = 0; j < dn*3; j += 3, dst += dcn ) + j = 0; + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8) + { + float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); + uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale))))); + uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); + uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); + + if (dcn == 4) + { + uint8x8x4_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + v_dst.val[3] = v_alpha; + vst4_u8(dst, v_dst); + } + else + { + uint8x8x3_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + vst3_u8(dst, v_dst); + } + } + #endif + for( ; j < dn*3; j += 3, dst += dcn ) { dst[0] = saturate_cast(buf[j]*255.f); dst[1] = saturate_cast(buf[j+1]*255.f); @@ -3404,6 +3462,10 @@ struct HLS2RGB_b int dstcn; HLS2RGB_f cvt; + #if CV_NEON + float32x4_t v_scale, v_scale_inv; + uint8x8_t v_alpha; + #endif }; From 06461b401a0e464ce5de8eaa3ca2152afae0ef2d Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 7 Oct 2014 10:27:28 -0700 Subject: [PATCH 32/40] cv::cvtColor (Lab2BGR CV_8U) --- modules/imgproc/src/color.cpp | 72 +++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 67dcfa67b..e34c67b45 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -3773,7 +3773,14 @@ struct Lab2RGB_b Lab2RGB_b( int _dstcn, int blueIdx, const float* _coeffs, const float* _whitept, bool _srgb ) - : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) {} + : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) + { + #if CV_NEON + v_scale_inv = vdupq_n_f32(100.f/255.f); + v_scale = vdupq_n_f32(255.f); + v_alpha = vdup_n_u8(ColorChannel::max()); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { @@ -3784,16 +3791,70 @@ struct Lab2RGB_b for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { int dn = std::min(n - i, (int)BLOCK_SIZE); + j = 0; - for( j = 0; j < dn*3; j += 3 ) + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24) + { + uint8x8x3_t v_src = vld3_u8(src + j); + uint16x8_t v_t0 = vmovl_u8(v_src.val[0]), + v_t1 = vmovl_u8(v_src.val[1]), + v_t2 = vmovl_u8(v_src.val[2]); + + float32x4x3_t v_dst; + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_128); + v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_128); + vst3q_f32(buf + j, v_dst); + + 
v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_128); + v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_128); + vst3q_f32(buf + j + 12, v_dst); + } + #endif + + for( ; j < dn*3; j += 3 ) { buf[j] = src[j]*(100.f/255.f); buf[j+1] = (float)(src[j+1] - 128); buf[j+2] = (float)(src[j+2] - 128); } cvt(buf, buf, dn); + j = 0; - for( j = 0; j < dn*3; j += 3, dst += dcn ) + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8) + { + float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); + uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale))))); + uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); + uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); + + if (dcn == 4) + { + uint8x8x4_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + v_dst.val[3] = v_alpha; + vst4_u8(dst, v_dst); + } + else + { + uint8x8x3_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + vst3_u8(dst, v_dst); + } + } + #endif + + for( ; j < dn*3; j += 3, dst += dcn ) { dst[0] = saturate_cast(buf[j]*255.f); dst[1] = saturate_cast(buf[j+1]*255.f); @@ -3806,6 +3867,11 @@ struct Lab2RGB_b int dstcn; Lab2RGB_f cvt; + + #if CV_NEON + float32x4_t v_scale, v_scale_inv, v_128; + uint8x8_t v_alpha; + #endif }; From 1c1c41d3be35041072ac649a6c74dcb89802b98a Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 7 Oct 2014 13:35:11 -0700 Subject: [PATCH 33/40] cv::cvtColor (Luv2RGB CV_8U) --- modules/imgproc/src/color.cpp | 75 +++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 3 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index e34c67b45..f745f218a 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -4074,7 +4074,18 @@ struct Luv2RGB_b Luv2RGB_b( int _dstcn, int blueIdx, const float* _coeffs, const float* _whitept, bool _srgb ) - : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) {} + : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) + { + #if CV_NEON + v_scale_inv = vdupq_n_f32(100.f/255.f); + v_coeff1 = vdupq_n_f32(1.388235294117647f); + v_coeff2 = vdupq_n_f32(1.027450980392157f); + v_134 = vdupq_n_f32(134.f); + v_140 = vdupq_n_f32(140.f); + v_scale = vdupq_n_f32(255.f); + v_alpha = vdup_n_u8(ColorChannel::max()); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { @@ -4085,8 +4096,29 @@ struct Luv2RGB_b for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { int dn = std::min(n - i, (int)BLOCK_SIZE); + j = 0; - for( j = 0; j < dn*3; j += 3 ) + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24) + { + uint8x8x3_t v_src = vld3_u8(src + j); + uint16x8_t v_t0 = vmovl_u8(v_src.val[0]), + v_t1 = vmovl_u8(v_src.val[1]), + v_t2 = vmovl_u8(v_src.val[2]); + + float32x4x3_t v_dst; + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_coeff1), v_134); + v_dst.val[2] = 
vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_coeff2), v_140); + vst3q_f32(buf + j, v_dst); + + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_coeff1), v_134); + v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_coeff2), v_140); + vst3q_f32(buf + j + 12, v_dst); + } + #endif + for( ; j < dn*3; j += 3 ) { buf[j] = src[j]*(100.f/255.f); buf[j+1] = (float)(src[j+1]*1.388235294117647f - 134.f); @@ -4094,7 +4126,39 @@ struct Luv2RGB_b } cvt(buf, buf, dn); - for( j = 0; j < dn*3; j += 3, dst += dcn ) + j = 0; + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8) + { + float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); + uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale))))); + uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale))))); + uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale))))); + + if (dcn == 4) + { + uint8x8x4_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + v_dst.val[3] = v_alpha; + vst4_u8(dst, v_dst); + } + else + { + uint8x8x3_t v_dst; + v_dst.val[0] = v_dst0; + v_dst.val[1] = v_dst1; + v_dst.val[2] = v_dst2; + vst3_u8(dst, v_dst); + } + } + #endif + + for( ; j < dn*3; j += 3, dst += dcn ) { dst[0] = saturate_cast(buf[j]*255.f); dst[1] = saturate_cast(buf[j+1]*255.f); @@ -4107,6 +4171,11 @@ struct Luv2RGB_b int dstcn; Luv2RGB_f cvt; + + #if CV_NEON + float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140; + uint8x8_t v_alpha; + #endif }; From 7166b7b640bf260287683c805e65216951c1d6a9 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 7 Oct 2014 13:43:14 -0700 Subject: [PATCH 34/40] cv::cvtColor (RGB2Luv CV_8U) --- modules/imgproc/src/color.cpp | 74 +++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index f745f218a..9a0070173 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -3779,6 +3779,7 @@ struct Lab2RGB_b v_scale_inv = vdupq_n_f32(100.f/255.f); v_scale = vdupq_n_f32(255.f); v_alpha = vdup_n_u8(ColorChannel::max()); + v_128 = vdupq_n_f32(128.0f); #endif } @@ -4035,7 +4036,18 @@ struct RGB2Luv_b RGB2Luv_b( int _srccn, int blueIdx, const float* _coeffs, const float* _whitept, bool _srgb ) - : srccn(_srccn), cvt(3, blueIdx, _coeffs, _whitept, _srgb) {} + : srccn(_srccn), cvt(3, blueIdx, _coeffs, _whitept, _srgb) + { + #if CV_NEON + v_scale_inv = vdupq_n_f32(1.f/255.f); + v_scale = vdupq_n_f32(2.55f); + v_coeff1 = vdupq_n_f32(0.72033898305084743f); + v_coeff2 = vdupq_n_f32(96.525423728813564f); + v_coeff3 = vdupq_n_f32(0.9732824427480916f); + v_coeff4 = vdupq_n_f32(136.259541984732824f); + v_alpha = vdup_n_u8(ColorChannel::max()); + #endif + } void operator()(const uchar* src, uchar* dst, int n) const { @@ -4045,8 +4057,41 @@ struct RGB2Luv_b for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 ) { int dn = std::min(n - i, (int)BLOCK_SIZE); + j = 0; - for( j = 0; j < dn*3; j += 3, src += scn ) + #if CV_NEON + for ( 
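The long decimal constants in Luv2RGB_b above and in this RGB2Luv_b are simply the affine maps between the float ranges L in [0,100], u in [-134,220], v in [-140,122] and the 8-bit range [0,255]: L8 = L*255/100, u8 = (u+134)*255/354, v8 = (v+140)*255/262, and their inverses on decode. A quick numeric check of that reading (the tolerances are mine):

    #include <cmath>

    int main()
    {
        bool ok = std::fabs(255.0 / 354.0        - 0.72033898305084743)  < 1e-12   // u scale
               && std::fabs(134.0 * 255.0 / 354.0 - 96.525423728813564)   < 1e-12   // u offset
               && std::fabs(255.0 / 262.0        - 0.9732824427480916)   < 1e-12   // v scale
               && std::fabs(140.0 * 255.0 / 262.0 - 136.259541984732824)  < 1e-12   // v offset
               && std::fabs(354.0 / 255.0        - 1.388235294117647)    < 1e-12   // u decode
               && std::fabs(262.0 / 255.0        - 1.027450980392157)    < 1e-12;  // v decode
        return ok ? 0 : 1;
    }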
; j <= (dn - 8) * 3; j += 24, src += 8 * scn) + { + uint16x8_t v_t0, v_t1, v_t2; + + if (scn == 3) + { + uint8x8x3_t v_src = vld3_u8(src); + v_t0 = vmovl_u8(v_src.val[0]); + v_t1 = vmovl_u8(v_src.val[1]); + v_t2 = vmovl_u8(v_src.val[2]); + } + else + { + uint8x8x4_t v_src = vld4_u8(src); + v_t0 = vmovl_u8(v_src.val[0]); + v_t1 = vmovl_u8(v_src.val[1]); + v_t2 = vmovl_u8(v_src.val[2]); + } + + float32x4x3_t v_dst; + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j, v_dst); + + v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv); + v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv); + v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv); + vst3q_f32(buf + j + 12, v_dst); + } + #endif + for( ; j < dn*3; j += 3, src += scn ) { buf[j] = src[0]*(1.f/255.f); buf[j+1] = (float)(src[1]*(1.f/255.f)); @@ -4054,7 +4099,25 @@ struct RGB2Luv_b } cvt(buf, buf, dn); - for( j = 0; j < dn*3; j += 3 ) + j = 0; + #if CV_NEON + for ( ; j <= (dn - 8) * 3; j += 24) + { + float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12); + + uint8x8x3_t v_dst; + v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))), + vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale))))); + v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[1], v_coeff1), v_coeff2))), + vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[1], v_coeff1), v_coeff2))))); + v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[2], v_coeff3), v_coeff4))), + vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[2], v_coeff3), v_coeff4))))); + + vst3_u8(dst + j, v_dst); + } + #endif + + for( ; j < dn*3; j += 3 ) { dst[j] = saturate_cast(buf[j]*2.55f); dst[j+1] = saturate_cast(buf[j+1]*0.72033898305084743f + 96.525423728813564f); @@ -4065,6 +4128,11 @@ struct RGB2Luv_b int srccn; RGB2Luv_f cvt; + + #if CV_NEON + float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4; + uint8x8_t v_alpha; + #endif }; From 48a9905f5d6014cf4a7234b1ab9f1734f13c852b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 8 Oct 2014 11:31:39 -0700 Subject: [PATCH 35/40] cv::pyrDown --- modules/imgproc/src/pyramids.cpp | 148 ++++++++++++++++++++++++++++++- 1 file changed, 146 insertions(+), 2 deletions(-) diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index 8eab60243..0154813f4 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -178,9 +178,153 @@ struct PyrDownVec_32f } }; +typedef NoVec PyrDownVec_32s16u; +typedef NoVec PyrDownVec_32s16s; + +#elif CV_NEON + +struct PyrDownVec_32s8u +{ + int operator()(int** src, uchar* dst, int, int width) const + { + int x = 0; + const unsigned int *row0 = (unsigned int*)src[0], *row1 = (unsigned int*)src[1], + *row2 = (unsigned int*)src[2], *row3 = (unsigned int*)src[3], + *row4 = (unsigned int*)src[4]; + uint16x8_t v_delta = vdupq_n_u16(128); + + for( ; x <= width - 16; x += 16 ) + { + uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4))); + uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), 
vqmovn_u32(vld1q_u32(row1 + x + 4))); + uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4))); + uint16x8_t v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x)), vqmovn_u32(vld1q_u32(row3 + x + 4))); + uint16x8_t v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x)), vqmovn_u32(vld1q_u32(row4 + x + 4))); + + v_r0 = vqaddq_u16(vqaddq_u16(v_r0, v_r4), vqaddq_u16(v_r2, v_r2)); + v_r1 = vqaddq_u16(vqaddq_u16(v_r1, v_r2), v_r3); + uint16x8_t v_dst0 = vqaddq_u16(v_r0, vshlq_n_u16(v_r1, 2)); + + v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12))); + v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12))); + v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12))); + v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x + 8)), vqmovn_u32(vld1q_u32(row3 + x + 12))); + v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x + 8)), vqmovn_u32(vld1q_u32(row4 + x + 12))); + + v_r0 = vqaddq_u16(vqaddq_u16(v_r0, v_r4), vqaddq_u16(v_r2, v_r2)); + v_r1 = vqaddq_u16(vqaddq_u16(v_r1, v_r2), v_r3); + uint16x8_t v_dst1 = vqaddq_u16(v_r0, vshlq_n_u16(v_r1, 2)); + + vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 8)), + vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 8)))); + } + + return x; + } +}; + +struct PyrDownVec_32s16u +{ + int operator()(int** src, ushort* dst, int, int width) const + { + int x = 0; + const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + int32x4_t v_delta = vdupq_n_s32(128); + + for( ; x <= width - 8; x += 8 ) + { + int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4); + int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4); + int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4); + int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4); + int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4); + + v_r00 = vaddq_s32(vqaddq_s32(v_r00, v_r40), vqaddq_s32(v_r20, v_r20)); + v_r10 = vaddq_s32(vqaddq_s32(v_r10, v_r20), v_r30); + int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r00, vshlq_n_s32(v_r10, 2)), v_delta), 8); + + v_r01 = vaddq_s32(vqaddq_s32(v_r01, v_r41), vqaddq_s32(v_r21, v_r21)); + v_r11 = vaddq_s32(vqaddq_s32(v_r11, v_r21), v_r31); + int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r01, vshlq_n_s32(v_r11, 2)), v_delta), 8); + + vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_dst0), vqmovun_s32(v_dst1))); + } + + return x; + } +}; + +struct PyrDownVec_32s16s +{ + int operator()(int** src, short* dst, int, int width) const + { + int x = 0; + const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + int32x4_t v_delta = vdupq_n_s32(128); + + for( ; x <= width - 8; x += 8 ) + { + int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4); + int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4); + int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4); + int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4); + int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4); + + v_r00 = vaddq_s32(vqaddq_s32(v_r00, v_r40), vqaddq_s32(v_r20, v_r20)); + v_r10 = vaddq_s32(vqaddq_s32(v_r10, v_r20), v_r30); + int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r00, vshlq_n_s32(v_r10, 2)), v_delta), 8); + + v_r01 = vaddq_s32(vqaddq_s32(v_r01, v_r41), vqaddq_s32(v_r21, 
v_r21)); + v_r11 = vaddq_s32(vqaddq_s32(v_r11, v_r21), v_r31); + int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vqaddq_s32(v_r01, vshlq_n_s32(v_r11, 2)), v_delta), 8); + + vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1))); + } + + return x; + } +}; + +struct PyrDownVec_32f +{ + int operator()(float** src, float* dst, int, int width) const + { + int x = 0; + const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; + float32x4_t v_4 = vdupq_n_f32(4.0f), v_scale = vdupq_n_f32(1.f/256.0f); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_r0 = vld1q_f32(row0 + x); + float32x4_t v_r1 = vld1q_f32(row1 + x); + float32x4_t v_r2 = vld1q_f32(row2 + x); + float32x4_t v_r3 = vld1q_f32(row3 + x); + float32x4_t v_r4 = vld1q_f32(row4 + x); + + v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2)); + v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3); + vst1q_f32(dst + x, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale)); + + v_r0 = vld1q_f32(row0 + x + 4); + v_r1 = vld1q_f32(row1 + x + 4); + v_r2 = vld1q_f32(row2 + x + 4); + v_r3 = vld1q_f32(row3 + x + 4); + v_r4 = vld1q_f32(row4 + x + 4); + + v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2)); + v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3); + vst1q_f32(dst + x + 4, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale)); + } + + return x; + } +}; + #else typedef NoVec PyrDownVec_32s8u; +typedef NoVec PyrDownVec_32s16u; +typedef NoVec PyrDownVec_32s16s; typedef NoVec PyrDownVec_32f; #endif @@ -561,9 +705,9 @@ void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borde if( depth == CV_8U ) func = pyrDown_, PyrDownVec_32s8u>; else if( depth == CV_16S ) - func = pyrDown_, NoVec >; + func = pyrDown_, PyrDownVec_32s16s >; else if( depth == CV_16U ) - func = pyrDown_, NoVec >; + func = pyrDown_, PyrDownVec_32s16u >; else if( depth == CV_32F ) func = pyrDown_, PyrDownVec_32f>; else if( depth == CV_64F ) From 00b8f0dec1ffd749fe4f933ed0c06804d48bd0b6 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 8 Oct 2014 12:04:49 -0700 Subject: [PATCH 36/40] cv::pyrUp --- modules/imgproc/src/pyramids.cpp | 39 ++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index 0154813f4..ad10f2902 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -320,6 +320,37 @@ struct PyrDownVec_32f } }; +struct PyrUpVec_32f +{ + int operator()(float** src, float* dst, int, int width) const + { + int x = 0; + float ** dsts = (float **)dst; + const float *row0 = src[0], *row1 = src[1], *row2 = src[2]; + float *dst0 = dsts[0], *dst1 = dsts[1]; + float32x4_t v_6 = vdupq_n_f32(6.0f), v_scale = vdupq_n_f32(1.f/64.0f), v_scale4 = vmulq_n_f32(v_scale, 4.0f); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_r0 = vld1q_f32(row0 + x); + float32x4_t v_r1 = vld1q_f32(row1 + x); + float32x4_t v_r2 = vld1q_f32(row2 + x); + + vst1q_f32(dst1 + x, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2))); + vst1q_f32(dst0 + x, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2))); + + v_r0 = vld1q_f32(row0 + x + 4); + v_r1 = vld1q_f32(row1 + x + 4); + v_r2 = vld1q_f32(row2 + x + 4); + + vst1q_f32(dst1 + x + 4, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2))); + vst1q_f32(dst0 + x + 4, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2))); + } + + return x; + } +}; + #else typedef NoVec PyrDownVec_32s8u; @@ -327,6 +358,8 @@ typedef NoVec PyrDownVec_32s16u; typedef NoVec 
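The vertical pass of pyrDown above is a 1-4-6-4-1 Gaussian across five horizontally pre-filtered rows, normalized by 256; the NEON structs regroup it as (r0 + r4 + 2*r2) + 4*(r1 + r2 + r3) so the whole tap costs two adds plus a shift-left-by-2. A scalar reference of one output sample (the function name and the clamp are mine):

    #include <cstdint>

    static inline uint8_t pyrDownTap(const int r[5])
    {
        // r0 + 4*r1 + 6*r2 + 4*r3 + r4, then round-to-nearest divide by 256
        int acc = (r[0] + r[4] + 2 * r[2]) + 4 * (r[1] + r[2] + r[3]);
        int v = (acc + 128) >> 8;
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    int main()
    {
        // Constant input passes through: horizontal 1-4-6-4-1 gives 16*x per row,
        // and 16 * x * 16 / 256 == x.
        const int rows[5] = { 16 * 10, 16 * 10, 16 * 10, 16 * 10, 16 * 10 };
        return pyrDownTap(rows) == 10 ? 0 : 1;
    }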
PyrDownVec_32s16s; typedef NoVec PyrDownVec_32f; +typedef NoVec PyrUpVec_32f; + #endif template void @@ -469,6 +502,7 @@ pyrUp_( const Mat& _src, Mat& _dst, int) AutoBuffer _dtab(ssize.width*cn); int* dtab = _dtab; WT* rows[PU_SZ]; + T* dsts[2]; CastOp castOp; VecOp vecOp; @@ -529,8 +563,9 @@ pyrUp_( const Mat& _src, Mat& _dst, int) for( k = 0; k < PU_SZ; k++ ) rows[k] = buf + ((y - PU_SZ/2 + k - sy0) % PU_SZ)*bufstep; row0 = rows[0]; row1 = rows[1]; row2 = rows[2]; + dsts[0] = dst0; dsts[1] = dst1; - x = vecOp(rows, dst0, (int)_dst.step, dsize.width); + x = vecOp(rows, (T*)dsts, (int)_dst.step, dsize.width); for( ; x < dsize.width; x++ ) { T t1 = castOp((row1[x] + row2[x])*4); @@ -780,7 +815,7 @@ void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderT else if( depth == CV_16U ) func = pyrUp_, NoVec >; else if( depth == CV_32F ) - func = pyrUp_, NoVec >; + func = pyrUp_, PyrUpVec_32f >; else if( depth == CV_64F ) func = pyrUp_, NoVec >; else From 183e378bd069796f30e69c4d597b743b1f6cf003 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 8 Oct 2014 12:50:29 -0700 Subject: [PATCH 37/40] cv::resize (INTER_LINEAR && INTER_CUBIC) --- modules/imgproc/src/imgwarp.cpp | 179 +++++++++++++++++++++++++++++++- 1 file changed, 177 insertions(+), 2 deletions(-) diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index f4c2cf287..a0b19dfb6 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -49,8 +49,6 @@ #include "precomp.hpp" #include "opencl_kernels_imgproc.hpp" -#include - #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) static IppStatus sts = ippInit(); #endif @@ -896,6 +894,183 @@ struct VResizeCubicVec_32f } }; +#elif CV_NEON + +typedef VResizeNoVec VResizeLinearVec_32s8u; + +struct VResizeLinearVec_32f16u +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + ushort* dst = (ushort*)_dst; + int x = 0; + + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); + float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); + + float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); + float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); + + vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeLinearVec_32f16s +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + short* dst = (short*)_dst; + int x = 0; + + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); + float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); + + float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); + float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); + + vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeLinearVec_32f +{ + int 
operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1]; + float* dst = (float*)_dst; + int x = 0; + + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); + float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); + + vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1)); + vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1)); + } + + return x; + } +}; + +typedef VResizeNoVec VResizeCubicVec_32s8u; + +struct VResizeCubicVec_32f16u +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + ushort* dst = (ushort*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4)); + + vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeCubicVec_32f16s +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + short* dst = (short*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x)); + float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4)); + + vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); + } + + return x; + } +}; + +struct VResizeCubicVec_32f +{ + int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const + { + const float** src = (const float**)_src; + const float* beta = (const float*)_beta; + const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; + float* dst = (float*)_dst; + int x = 0; + float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), + v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); + + for( ; x <= width - 8; x += 8 ) + { + vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), + v_b1, vld1q_f32(S1 + x)), + v_b2, vld1q_f32(S2 + x)), + v_b3, vld1q_f32(S3 + x))); + vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), + v_b1, vld1q_f32(S1 + x + 4)), + 
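All of the VResize*Vec structs compute the same thing: each output row of the separable resize is a weighted sum of 2 (linear) or 4 (cubic) horizontally filtered rows, with weights beta[]; the NEON versions just express that sum as a chain of vmlaq_f32 multiply-adds. A scalar reference under the same assumptions (the function name and the ksize parameter are mine):

    // Vertical pass of the separable resize, ksize = 2 (linear) or 4 (cubic).
    static inline void vresizeRow(const float* const* rows, const float* beta,
                                  float* dst, int width, int ksize)
    {
        for (int x = 0; x < width; x++)
        {
            float s = 0.f;
            for (int k = 0; k < ksize; k++)
                s += beta[k] * rows[k][x];   // same sum the vmlaq_f32 chain builds
            dst[x] = s;
        }
    }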
v_b2, vld1q_f32(S2 + x + 4)), + v_b3, vld1q_f32(S3 + x + 4))); + } + + return x; + } +}; + #else typedef VResizeNoVec VResizeLinearVec_32s8u; From 4babecf3b0e3512fd9f168ae8ed69de53c1853f6 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 9 Oct 2014 12:38:14 +0000 Subject: [PATCH 38/40] fixes for cv::addWeighted and cv::Mat::dot --- modules/core/src/arithm.cpp | 16 ++++++++-------- modules/core/src/matmul.cpp | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index a1c0d3cfd..b7f02c525 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -2638,8 +2638,8 @@ struct AddWeighted_SIMD out_f_l = vaddq_f32(out_f_l, g); out_f_h = vaddq_f32(out_f_h, g); - int16x4_t out_16_l = vqmovn_s32(vcvtq_s32_f32(out_f_l)); - int16x4_t out_16_h = vqmovn_s32(vcvtq_s32_f32(out_f_h)); + int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l)); + int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h)); int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h); int8x8_t out = vqmovn_s16(out_16); @@ -2666,11 +2666,11 @@ struct AddWeighted_SIMD float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha); float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta); - uint16x4_t v_dst1 = vqmovn_u32(vcvtq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha); v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta); - uint16x4_t v_dst2 = vqmovn_u32(vcvtq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); } @@ -2694,11 +2694,11 @@ struct AddWeighted_SIMD float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha); float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta); - int16x4_t v_dst1 = vqmovn_s32(vcvtq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha); v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta); - int16x4_t v_dst2 = vqmovn_s32(vcvtq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2)); } @@ -2801,8 +2801,8 @@ addWeighted8u( const uchar* src1, size_t step1, out_f_l = vaddq_f32(out_f_l, g); out_f_h = vaddq_f32(out_f_h, g); - uint16x4_t out_16_l = vqmovun_s32(vcvtq_s32_f32(out_f_l)); - uint16x4_t out_16_h = vqmovun_s32(vcvtq_s32_f32(out_f_h)); + uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l)); + uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h)); uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h); uint8x8_t out = vqmovn_u16(out_16); diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp index b86b5929e..2501fb2d0 100644 --- a/modules/core/src/matmul.cpp +++ b/modules/core/src/matmul.cpp @@ -3008,7 +3008,7 @@ static double dotProd_32f(const float* src1, const float* src2, int len) return r; setIppErrorStatus(); #elif CV_NEON - int len0 = len & -4, blockSize0 = (1 << 15), blockSize; + int len0 = len & -4, blockSize0 = (1 << 13), blockSize; float32x4_t 
v_zero = vdupq_n_f32(0.0f);
 CV_DECL_ALIGNED(16) float buf[4];

From 5f23d999185e8392eccf0f21b6434e4d4814c27f Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Fri, 10 Oct 2014 11:05:19 +0000
Subject: [PATCH 39/40] the rest modes of cv::Mat::convertTo

---
 modules/core/src/convert.cpp     | 62 ++++++++++++++++++++++++++++++++
 modules/imgproc/src/corner.cpp   |  2 ++
 modules/imgproc/src/pyramids.cpp |  2 ++
 3 files changed, 66 insertions(+)

diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 67adfde74..9d758e7f1 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -2447,6 +2447,25 @@ struct Cvt_SIMD
     }
 };
 
+template <>
+struct Cvt_SIMD<schar, ushort>
+{
+    int operator() (const schar * src, ushort * dst, int width) const
+    {
+        int x = 0;
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
+            vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(vmovl_s16(vget_low_s16(v_src))),
+                                            vqmovun_s32(vmovl_s16(vget_high_s16(v_src)))));
+        }
+
+        return x;
+    }
+};
+
+
 template <>
 struct Cvt_SIMD
 {
@@ -2502,6 +2521,49 @@ struct Cvt_SIMD
     }
 };
 
+template <>
+struct Cvt_SIMD<ushort, schar>
+{
+    int operator() (const ushort * src, schar * dst, int width) const
+    {
+        int x = 0;
+
+        for ( ; x <= width - 16; x += 16)
+        {
+            uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8);
+            int32x4_t v_dst10 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src1)));
+            int32x4_t v_dst11 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src1)));
+            int32x4_t v_dst20 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src2)));
+            int32x4_t v_dst21 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src2)));
+
+            vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst10), vqmovn_s32(v_dst11))),
+                                          vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst20), vqmovn_s32(v_dst21)))));
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct Cvt_SIMD<ushort, short>
+{
+    int operator() (const ushort * src, short * dst, int width) const
+    {
+        int x = 0;
+
+        for ( ; x <= width - 8; x += 8)
+        {
+            uint16x8_t v_src = vld1q_u16(src + x);
+            int32x4_t v_dst0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src)));
+            int32x4_t v_dst1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src)));
+
+            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1)));
+        }
+
+        return x;
+    }
+};
+
 template <>
 struct Cvt_SIMD
 {
diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp
index 4d55a4f18..096997a60 100644
--- a/modules/imgproc/src/corner.cpp
+++ b/modules/imgproc/src/corner.cpp
@@ -618,7 +618,9 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord
     if( src.depth() == CV_8U )
         factor *= 255;
     factor = 1./(factor * factor * factor);
+#if CV_NEON || CV_SSE2
     float factor_f = (float)factor;
+#endif
 
 #if CV_SSE2
     volatile bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp
index ad10f2902..8a8515e7a 100644
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -181,6 +181,8 @@ struct PyrDownVec_32f
 typedef NoVec<int, ushort> PyrDownVec_32s16u;
 typedef NoVec<int, short> PyrDownVec_32s16s;
 
+typedef NoVec<float, float> PyrUpVec_32f;
+
 #elif CV_NEON
 
 struct PyrDownVec_32s8u

From af04a853033aab12af23ce2494e4ac0f0b74f0dd Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Fri, 10 Oct 2014 10:00:13 -0700
Subject: [PATCH 40/40] fixes for cv::accumulate**

---
 modules/imgproc/src/accum.cpp | 49 +++++------------------------------
 1 file changed, 7 insertions(+), 42 deletions(-)

diff --git 
a/modules/imgproc/src/accum.cpp b/modules/imgproc/src/accum.cpp index 7fd2f2a10..b8906a1ca 100644 --- a/modules/imgproc/src/accum.cpp +++ b/modules/imgproc/src/accum.cpp @@ -96,7 +96,7 @@ struct Acc_SIMD len *= cn; for ( ; x <= len - 16; x += 16) { - uint8x16_t v_src = vld1q_u8(src); + uint8x16_t v_src = vld1q_u8(src + x); uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src)); vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))))); @@ -111,7 +111,7 @@ struct Acc_SIMD for ( ; x <= len - 16; x += 16) { - uint8x16_t v_src = vandq_u8(vld1q_u8(src), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0))); + uint8x16_t v_src = vandq_u8(vld1q_u8(src + x), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0))); uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src)); vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))))); @@ -182,7 +182,7 @@ struct AccSqr_SIMD len *= cn; for ( ; x <= len - 16; x += 16) { - uint8x16_t v_src = vld1q_u8(src); + uint8x16_t v_src = vld1q_u8(src + x); uint8x8_t v_src_0 = vget_low_u8(v_src), v_src_1 = vget_high_u8(v_src); uint16x8_t v_src0 = vmull_u8(v_src_0, v_src_0), v_src1 = vmull_u8(v_src_1, v_src_1); @@ -198,7 +198,7 @@ struct AccSqr_SIMD for ( ; x <= len - 16; x += 16) { - uint8x16_t v_src = vandq_u8(vld1q_u8(src), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0))); + uint8x16_t v_src = vandq_u8(vld1q_u8(src + x), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0))); uint8x8_t v_src_0 = vget_low_u8(v_src), v_src_1 = vget_high_u8(v_src); uint16x8_t v_src0 = vmull_u8(v_src_0, v_src_0), v_src1 = vmull_u8(v_src_1, v_src_1); @@ -292,7 +292,7 @@ struct AccProd_SIMD len *= cn; for ( ; x <= len - 16; x += 16) { - uint8x16_t v_1src = vld1q_u8(src1), v_2src = vld1q_u8(src2); + uint8x16_t v_1src = vld1q_u8(src1 + x), v_2src = vld1q_u8(src2 + x); uint16x8_t v_src0 = vmull_u8(vget_low_u8(v_1src), vget_low_u8(v_2src)), v_src1 = vmull_u8(vget_high_u8(v_1src), vget_high_u8(v_2src)); @@ -309,7 +309,7 @@ struct AccProd_SIMD for ( ; x <= len - 16; x += 16) { uint8x16_t v_mask = veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0)); - uint8x16_t v_1src = vandq_u8(vld1q_u8(src1), v_mask), v_2src = vandq_u8(vld1q_u8(src2), v_mask); + uint8x16_t v_1src = vandq_u8(vld1q_u8(src1 + x), v_mask), v_2src = vandq_u8(vld1q_u8(src2 + x), v_mask); uint16x8_t v_src0 = vmull_u8(vget_low_u8(v_1src), vget_low_u8(v_2src)), v_src1 = vmull_u8(vget_high_u8(v_1src), vget_high_u8(v_2src)); @@ -402,7 +402,7 @@ struct AccW_SIMD len *= cn; for ( ; x <= len - 16; x += 16) { - uint8x16_t v_src = vld1q_u8(src); + uint8x16_t v_src = vld1q_u8(src + x); uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src)); vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), @@ -415,25 +415,6 @@ struct AccW_SIMD vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_alpha)); } } - else if (cn == 1) - { - uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0); - - for ( ; x <= len - 16; x += 16) - { - uint8x16_t v_src = vandq_u8(vld1q_u8(src), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0))); - uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src)); - - vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), v_alpha)); - vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), 
v_alpha)); - vst1q_f32(dst + x + 8, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 8), v_beta), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_alpha)); - vst1q_f32(dst + x + 12, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 12), v_beta), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_alpha)); - } - } return x; } @@ -459,22 +440,6 @@ struct AccW_SIMD vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), vcvtq_f32_u32(v_src1), v_alpha)); } } - else if (cn == 1) - { - uint8x8_t v_255 = vdup_n_u8(255), v_0 = vdup_n_u8(0); - - for ( ; x <= len - 8; x += 8) - { - uint8x8_t v_mask_src = veor_u8(v_255, vceq_u8(vld1_u8(mask + x), v_0)); - uint8x8x2_t v_mask_zp = vzip_u8(v_mask_src, v_mask_src); - uint16x8_t v_mask = vreinterpretq_u16_u8(vcombine_u8(v_mask_zp.val[0], v_mask_zp.val[1])), - v_src = vandq_u16(vld1q_u16(src + x), v_mask); - uint32x4_t v_src0 = vmovl_u16(vget_low_u16(v_src)), v_src1 = vmovl_u16(vget_high_u16(v_src)); - - vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), vcvtq_f32_u32(v_src0), v_alpha)); - vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), vcvtq_f32_u32(v_src1), v_alpha)); - } - } return x; }
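// Note: the AccW_SIMD specializations touched above vectorize the running-average
// update performed by cv::accumulateWeighted, dst = dst*(1 - alpha) + src*alpha,
// where v_beta is expected to hold 1 - alpha. A minimal scalar sketch of that update
// follows for reference only; the helper name accW_scalar is illustrative and is not
// part of this patch series.
static inline void accW_scalar(const ushort* src, float* dst, int len, float alpha)
{
    float beta = 1.f - alpha;                 // weight of the existing accumulator value
    for (int x = 0; x < len; x++)
        dst[x] = dst[x]*beta + src[x]*alpha;  // same formula the vmlaq_f32/vmulq_f32 chain computes per lane
}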