From 225da0226e4e9d54e97aee488ac2b5ed4e813443 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A1=D0=B5=D1=80=D0=B3=D0=B5=D0=B9=20=D0=9D=D1=83=D0=B6?=
 =?UTF-8?q?=D0=BD=D1=8B=D0=B9?=
Date: Tue, 7 Apr 2015 15:09:47 +0300
Subject: [PATCH 01/48] Fix bug in SVM::trainAuto

---
 modules/ml/src/svm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index 449eb8dcd..b74df5f0a 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -1787,7 +1787,7 @@ public:
             if( !do_train( temp_train_samples, temp_train_responses ))
                 continue;
 
-            for( i = 0; i < test_sample_count; i++ )
+            for (i = 0; i < train_sample_count; i++)
             {
                 j = sidx[(i+start+train_sample_count) % sample_count];
                 memcpy(temp_train_samples.ptr(i), samples.ptr(j), sample_size);

From 0c24ccf53897c60dff19193623a97fc4713ce14e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A1=D0=B5=D1=80=D0=B3=D0=B5=D0=B9=20=D0=9D=D1=83=D0=B6?=
 =?UTF-8?q?=D0=BD=D1=8B=D0=B9?=
Date: Tue, 7 Apr 2015 15:43:59 +0300
Subject: [PATCH 02/48] Revert "Fix bug in SVM::trainAuto"

This reverts commit 225da0226e4e9d54e97aee488ac2b5ed4e813443.

---
 modules/ml/src/svm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index b74df5f0a..449eb8dcd 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -1787,7 +1787,7 @@ public:
             if( !do_train( temp_train_samples, temp_train_responses ))
                 continue;
 
-            for (i = 0; i < train_sample_count; i++)
+            for( i = 0; i < test_sample_count; i++ )
             {
                 j = sidx[(i+start+train_sample_count) % sample_count];
                 memcpy(temp_train_samples.ptr(i), samples.ptr(j), sample_size);

From 3646967deb9b48f45c44203e5c96836a8a180218 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A1=D0=B5=D1=80=D0=B3=D0=B5=D0=B9=20=D0=9D=D1=83=D0=B6?=
 =?UTF-8?q?=D0=BD=D1=8B=D0=B9?=
Date: Tue, 7 Apr 2015 15:48:30 +0300
Subject: [PATCH 03/48] Fix bug in SVM::trainAuto

---
 modules/ml/src/svm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index 449eb8dcd..95b5fb97d 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -1787,7 +1787,7 @@ public:
             if( !do_train( temp_train_samples, temp_train_responses ))
                 continue;
 
-            for( i = 0; i < test_sample_count; i++ )
+            for( i = 0; i < train_sample_count; i++ )
            {
                 j = sidx[(i+start+train_sample_count) % sample_count];
                 memcpy(temp_train_samples.ptr(i), samples.ptr(j), sample_size);

From e800800bec8596127274f359e744723a7371b5b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A1=D0=B5=D1=80=D0=B3=D0=B5=D0=B9=20=D0=9D=D1=83=D0=B6?=
 =?UTF-8?q?=D0=BD=D1=8B=D0=B9?=
Date: Wed, 8 Apr 2015 12:43:38 +0300
Subject: [PATCH 04/48] Squashed commit of the following:
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 3646967deb9b48f45c44203e5c96836a8a180218
Author: Сергей Нужный
Date:   Tue Apr 7 15:48:30 2015 +0300

    Fix bug in SVM::trainAuto

commit 0c24ccf53897c60dff19193623a97fc4713ce14e
Author: Сергей Нужный
Date:   Tue Apr 7 15:43:59 2015 +0300

    Revert "Fix bug in SVM::trainAuto"

    This reverts commit 225da0226e4e9d54e97aee488ac2b5ed4e813443.
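For context on the SVM::trainAuto fix above (applied, reverted, and re-applied across PATCH 01-03): trainAuto cross-validates by rotating a window over a shuffled index array sidx. Below is a minimal, hypothetical C++ sketch of that index rotation, reusing the names from the diff; it is an illustration, not the actual cv::ml code.

#include <vector>

// Hypothetical illustration of the rotating fold selection in SVM::trainAuto.
// For the fold beginning at `start`, the first train_sample_count shuffled
// indices form the training set; the wrapped-around remainder is the test set.
struct FoldIndices { std::vector<int> train, test; };

static FoldIndices makeFold(const std::vector<int>& sidx, int start, int train_sample_count)
{
    const int sample_count = (int)sidx.size();
    FoldIndices f;
    for (int i = 0; i < train_sample_count; i++)
        f.train.push_back(sidx[(i + start) % sample_count]);                      // training rows
    for (int i = 0; i < sample_count - train_sample_count; i++)
        f.test.push_back(sidx[(i + start + train_sample_count) % sample_count]);  // held-out rows
    return f;
}

The disputed loop bound controls how many rows get copied into temp_train_samples; the fix settles on train_sample_count, presumably to match the row count the buffer was allocated for.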
commit 225da0226e4e9d54e97aee488ac2b5ed4e813443 Author: Сергей Нужный Date: Tue Apr 7 15:09:47 2015 +0300 Fix bug in SVM::trainAuto --- modules/ml/src/svm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp index 449eb8dcd..95b5fb97d 100644 --- a/modules/ml/src/svm.cpp +++ b/modules/ml/src/svm.cpp @@ -1787,7 +1787,7 @@ public: if( !do_train( temp_train_samples, temp_train_responses )) continue; - for( i = 0; i < test_sample_count; i++ ) + for( i = 0; i < train_sample_count; i++ ) { j = sidx[(i+start+train_sample_count) % sample_count]; memcpy(temp_train_samples.ptr(i), samples.ptr(j), sample_size); From 4a65f3cee2a657f5e7da7c9ed26ad9576417dc04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A1=D0=B5=D1=80=D0=B3=D0=B5=D0=B9=20=D0=9D=D1=83=D0=B6?= =?UTF-8?q?=D0=BD=D1=8B=D0=B9?= Date: Wed, 8 Apr 2015 12:51:57 +0300 Subject: [PATCH 05/48] Squashed commit of the following: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5c379aa1b83ddd66f67b1a9c2ccd00e5bac8576e Merge: e800800 3646967 Author: Сергей Нужный Date: Wed Apr 8 12:44:46 2015 +0300 Merge branch 'master' of https://github.com/Nuzhny007/opencv commit e800800bec8596127274f359e744723a7371b5b7 Author: Сергей Нужный Date: Wed Apr 8 12:43:38 2015 +0300 Squashed commit of the following: commit 3646967deb9b48f45c44203e5c96836a8a180218 Author: Сергей Нужный Date: Tue Apr 7 15:48:30 2015 +0300 Fix bug in SVM::trainAuto commit 0c24ccf53897c60dff19193623a97fc4713ce14e Author: Сергей Нужный Date: Tue Apr 7 15:43:59 2015 +0300 Revert "Fix bug in SVM::trainAuto" This reverts commit 225da0226e4e9d54e97aee488ac2b5ed4e813443. commit 225da0226e4e9d54e97aee488ac2b5ed4e813443 Author: Сергей Нужный Date: Tue Apr 7 15:09:47 2015 +0300 Fix bug in SVM::trainAuto commit 3646967deb9b48f45c44203e5c96836a8a180218 Author: Сергей Нужный Date: Tue Apr 7 15:48:30 2015 +0300 Fix bug in SVM::trainAuto commit 0c24ccf53897c60dff19193623a97fc4713ce14e Author: Сергей Нужный Date: Tue Apr 7 15:43:59 2015 +0300 Revert "Fix bug in SVM::trainAuto" This reverts commit 225da0226e4e9d54e97aee488ac2b5ed4e813443. commit 225da0226e4e9d54e97aee488ac2b5ed4e813443 Author: Сергей Нужный Date: Tue Apr 7 15:09:47 2015 +0300 Fix bug in SVM::trainAuto --- modules/ml/src/svm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp index 449eb8dcd..95b5fb97d 100644 --- a/modules/ml/src/svm.cpp +++ b/modules/ml/src/svm.cpp @@ -1787,7 +1787,7 @@ public: if( !do_train( temp_train_samples, temp_train_responses )) continue; - for( i = 0; i < test_sample_count; i++ ) + for( i = 0; i < train_sample_count; i++ ) { j = sidx[(i+start+train_sample_count) % sample_count]; memcpy(temp_train_samples.ptr(i), samples.ptr(j), sample_size); From 2f95acf273c2034fd01120bedb6fda1a40a49309 Mon Sep 17 00:00:00 2001 From: "S. 
Garrido" Date: Sat, 11 Apr 2015 19:03:17 +0200 Subject: [PATCH 06/48] completing and fixing gen_pattern.py --- doc/pattern_tools/gen_pattern.py | 63 +++++++++++++++----------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/doc/pattern_tools/gen_pattern.py b/doc/pattern_tools/gen_pattern.py index 3643b6d3b..fc1e74bbc 100755 --- a/doc/pattern_tools/gen_pattern.py +++ b/doc/pattern_tools/gen_pattern.py @@ -1,13 +1,19 @@ #!/usr/bin/env python """gen_pattern.py -To run: --c 10 -r 12 -o out.svg --T type of pattern, circles, acircles, checkerboard --s --square_size size of squares in pattern --u --units mm, inches, px, m --w page width in units --h page height in units +Usage example: +python gen_pattern.py -o out.svg -r 11 -c 8 -T circles -s 20.0 -R 5.0 -u mm -w 216 -h 279 + +-o, --output - output file (default out.svg) +-r, --rows - pattern rows (default 11) +-c, --columns - pattern columns (default 8) +-T, --type - type of pattern, circles, acircles, checkerboard (default circles) +-s, --square_size - size of squares in pattern (default 20.0) +-R, --radius_rate - circles_radius = square_size/radius_rate (default 5.0) +-u, --units - mm, inches, px, m (default mm) +-w, --page_width - page width in units (default 216) +-h, --page_height - page height in units (default 279) +-H, --help - show help """ from svgfig import * @@ -16,18 +22,20 @@ import sys import getopt class PatternMaker: - def __init__(self, cols,rows,output,units,square_size,page_width,page_height): + def __init__(self, cols,rows,output,units,square_size,radius_rate,page_width,page_height): self.cols = cols self.rows = rows self.output = output self.units = units self.square_size = square_size + self.radius_rate = radius_rate self.width = page_width self.height = page_height self.g = SVG("g") # the svg group container + def makeCirclesPattern(self): spacing = self.square_size - r = spacing / 5.0 #radius is a 5th of the spacing TODO parameterize + r = spacing / self.radius_rate for x in range(1,self.cols+1): for y in range(1,self.rows+1): dot = SVG("circle", cx=x * spacing, cy=y * spacing, r=r, fill="black") @@ -35,7 +43,7 @@ class PatternMaker: def makeACirclesPattern(self): spacing = self.square_size - r = spacing / 5.0 + r = spacing / self.radius_rate for i in range(0,self.rows): for j in range(0,self.cols): dot = SVG("circle", cx= ((j*2 + i%2)*spacing) + spacing, cy=self.height - (i * spacing + spacing), r=r, fill="black") @@ -43,37 +51,23 @@ class PatternMaker: def makeCheckerboardPattern(self): spacing = self.square_size - r = spacing / 5.0 for x in range(1,self.cols+1): for y in range(1,self.rows+1): - #TODO make a checkerboard pattern - dot = SVG("circle", cx=x * spacing, cy=y * spacing, r=r, fill="black") - self.g.append(dot) + if x%2 == y%2: + dot = SVG("rect", x=x * spacing, y=y * spacing, width=spacing, height=spacing, stroke_width="0", fill="black") + self.g.append(dot) + def save(self): c = canvas(self.g,width="%d%s"%(self.width,self.units),height="%d%s"%(self.height,self.units),viewBox="0 0 %d %d"%(self.width,self.height)) c.inkview(self.output) -def makePattern(cols,rows,output,p_type,units,square_size,page_width,page_height): - width = page_width - spacing = square_size - height = page_height - r = spacing / 5.0 - g = SVG("g") # the svg group container - for x in range(1,cols+1): - for y in range(1,rows+1): - if "circle" in p_type: - dot = SVG("circle", cx=x * spacing, cy=y * spacing, r=r, fill="black") - g.append(dot) - c = 
canvas(g,width="%d%s"%(width,units),height="%d%s"%(height,units),viewBox="0 0 %d %d"%(width,height)) - c.inkview(output) - def main(): # parse command line options, TODO use argparse for better doc try: - opts, args = getopt.getopt(sys.argv[1:], "ho:c:r:T:u:s:w:h:", ["help","output","columns","rows", - "type","units","square_size","page_width", - "page_height"]) + opts, args = getopt.getopt(sys.argv[1:], "Ho:c:r:T:u:s:R:w:h:", ["help","output=","columns=","rows=", + "type=","units=","square_size=","radius_rate=", + "page_width=","page_height="]) except getopt.error, msg: print msg print "for help use --help" @@ -84,11 +78,12 @@ def main(): p_type = "circles" units = "mm" square_size = 20.0 + radius_rate = 5.0 page_width = 216 #8.5 inches page_height = 279 #11 inches # process options for o, a in opts: - if o in ("-h", "--help"): + if o in ("-H", "--help"): print __doc__ sys.exit(0) elif o in ("-r", "--rows"): @@ -103,11 +98,13 @@ def main(): units = a elif o in ("-s", "--square_size"): square_size = float(a) + elif o in ("-R", "--radius_rate"): + radius_rate = float(a) elif o in ("-w", "--page_width"): page_width = float(a) elif o in ("-h", "--page_height"): page_height = float(a) - pm = PatternMaker(columns,rows,output,units,square_size,page_width,page_height) + pm = PatternMaker(columns,rows,output,units,square_size,radius_rate,page_width,page_height) #dict for easy lookup of pattern type mp = {"circles":pm.makeCirclesPattern,"acircles":pm.makeACirclesPattern,"checkerboard":pm.makeCheckerboardPattern} mp[p_type]() From 3c797b0ca55b082c98a0d4dd8cc120d7279f3f20 Mon Sep 17 00:00:00 2001 From: ttagu99 Date: Tue, 14 Apr 2015 10:01:53 +0900 Subject: [PATCH 07/48] findcontour_example check image empty --- .../tutorial_code/ShapeDescriptors/findContours_demo.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/samples/cpp/tutorial_code/ShapeDescriptors/findContours_demo.cpp b/samples/cpp/tutorial_code/ShapeDescriptors/findContours_demo.cpp index cd29b1c2e..6a6de9539 100644 --- a/samples/cpp/tutorial_code/ShapeDescriptors/findContours_demo.cpp +++ b/samples/cpp/tutorial_code/ShapeDescriptors/findContours_demo.cpp @@ -27,8 +27,13 @@ void thresh_callback(int, void* ); */ int main( int, char** argv ) { - /// Load source image and convert it to gray - src = imread( argv[1], 1 ); + /// Load source image + src = imread(argv[1]); + if (src.empty()) + { + cerr << "No image supplied ..." << endl; + return -1; + } /// Convert image to gray and blur it cvtColor( src, src_gray, COLOR_BGR2GRAY ); From 3b177eaaa3bd2933e9910e21bfdf5a23af8e883d Mon Sep 17 00:00:00 2001 From: Yang Fan Date: Tue, 14 Apr 2015 13:39:00 +0800 Subject: [PATCH 08/48] ENH: explicitly declared outputFilename to surpress error C2668 --- samples/gpu/video_writer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/gpu/video_writer.cpp b/samples/gpu/video_writer.cpp index 5a48c69aa..6c5d1412d 100644 --- a/samples/gpu/video_writer.cpp +++ b/samples/gpu/video_writer.cpp @@ -69,7 +69,8 @@ int main(int argc, const char* argv[]) { std::cout << "Open CUDA Writer" << std::endl; - d_writer = cv::cudacodec::createVideoWriter("output_gpu.avi", frame.size(), FPS); + const cv::String outputFilename = "output_gpu.avi"; + d_writer = cv::cudacodec::createVideoWriter(outputFilename, frame.size(), FPS); } d_frame.upload(frame); From 4686b935c172cc56707966f7e6e42372e8f709cf Mon Sep 17 00:00:00 2001 From: Daeyun Shin Date: Tue, 14 Apr 2015 07:44:32 -0500 Subject: [PATCH 09/48] Fix typo in documentation. 
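Two notes on the patches above. First, PATCH 08: error C2668 is Visual C++'s "ambiguous call to overloaded function", raised when an argument (here a string literal) converts equally well to the parameter types of two overloads; declaring the variable with an explicit type selects one conversion up front. A reduced, self-contained illustration of the pattern (the struct and function names are invented for this example):

// Reduced model of MSVC error C2668: both parameter types are constructible
// from a string literal, so the direct call is ambiguous.
struct StringA { StringA(const char*) {} };
struct StringB { StringB(const char*) {} };

static void createWriter(const StringA&) {}
static void createWriter(const StringB&) {}

int main()
{
    // createWriter("output_gpu.avi");               // would not compile: C2668
    const StringA outputFilename = "output_gpu.avi"; // pick the conversion explicitly
    createWriter(outputFilename);                    // unambiguous
    return 0;
}

Second, the documentation typo fixed in PATCH 09 (its diff follows): with normalized coordinates \f$\hat{p} = K^{-1}[p; 1]\f$, the essential-matrix constraint \f$\hat{p}_2^T E \hat{p}_1 = 0\f$ becomes \f$[p_2; 1]^T K^{-T} E K^{-1} [p_1; 1] = 0\f$ in pixel coordinates, i.e. the fundamental matrix is \f$F = K^{-T} E K^{-1}\f$; the old text dropped the inverses.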
F = K^-T * E * K^-1 --- modules/calib3d/include/opencv2/calib3d.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp index 7b01a7bbc..744ded305 100644 --- a/modules/calib3d/include/opencv2/calib3d.hpp +++ b/modules/calib3d/include/opencv2/calib3d.hpp @@ -1200,7 +1200,7 @@ for the other points. The array is computed only in the RANSAC and LMedS methods This function estimates essential matrix based on the five-point algorithm solver in @cite Nister03 . @cite SteweniusCFS is also a related. The epipolar geometry is described by the following equation: -\f[[p_2; 1]^T K^T E K [p_1; 1] = 0 \\\f]\f[K = +\f[[p_2; 1]^T K^{-T} E K^{-1} [p_1; 1] = 0 \\\f]\f[K = \begin{bmatrix} f & 0 & x_{pp} \\ 0 & f & y_{pp} \\ From 17bedd72f03eb3c0df6f162c4d285add85a84724 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 14 Apr 2015 16:55:11 +0300 Subject: [PATCH 10/48] workaround for stange internal compiler error when cross-compiling with 4.6 --- modules/core/src/out.cpp | 46 ++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/modules/core/src/out.cpp b/modules/core/src/out.cpp index 89919715e..2c6ba5396 100644 --- a/modules/core/src/out.cpp +++ b/modules/core/src/out.cpp @@ -43,9 +43,9 @@ #include "precomp.hpp" -namespace +namespace cv { - class FormattedImpl : public cv::Formatted + class FormattedImpl : public Formatted { enum { STATE_PROLOGUE, STATE_EPILOGUE, STATE_INTERLUDE, STATE_ROW_OPEN, STATE_ROW_CLOSE, STATE_CN_OPEN, STATE_CN_CLOSE, STATE_VALUE, STATE_FINISHED, @@ -55,7 +55,7 @@ namespace char floatFormat[8]; char buf[32]; // enough for double with precision up to 20 - cv::Mat mtx; + Mat mtx; int mcn; // == mtx.channels() bool singleLine; bool alignOrder; // true when cn first order @@ -65,8 +65,8 @@ namespace int col; int cn; - cv::String prologue; - cv::String epilogue; + String prologue; + String epilogue; char braces[5]; void (FormattedImpl::*valueToStr)(); @@ -81,7 +81,7 @@ namespace public: - FormattedImpl(cv::String pl, cv::String el, cv::Mat m, char br[5], bool sLine, bool aOrder, int precision) + FormattedImpl(String pl, String el, Mat m, char br[5], bool sLine, bool aOrder, int precision) { CV_Assert(m.dims <= 2); @@ -253,7 +253,7 @@ namespace } }; - class FormatterBase : public cv::Formatter + class FormatterBase : public Formatter { public: FormatterBase() : prec32f(8), prec64f(16), multiline(true) {} @@ -278,14 +278,15 @@ namespace int prec64f; int multiline; }; + class DefaultFormatter : public FormatterBase { public: - cv::Ptr format(const cv::Mat& mtx) const + Ptr format(const Mat& mtx) const { char braces[5] = {'\0', '\0', ';', '\0', '\0'}; - return cv::makePtr("[", "]", mtx, &*braces, + return makePtr("[", "]", mtx, &*braces, mtx.rows == 1 || !multiline, false, mtx.depth() == CV_64F ? prec64f : prec32f ); } }; @@ -294,10 +295,10 @@ namespace { public: - cv::Ptr format(const cv::Mat& mtx) const + Ptr format(const Mat& mtx) const { char braces[5] = {'\0', '\0', ';', '\0', '\0'}; - return cv::makePtr("", "", mtx, &*braces, + return makePtr("", "", mtx, &*braces, mtx.rows == 1 || !multiline, true, mtx.depth() == CV_64F ? 
prec64f : prec32f ); } }; @@ -306,12 +307,12 @@ namespace { public: - cv::Ptr format(const cv::Mat& mtx) const + Ptr format(const Mat& mtx) const { char braces[5] = {'[', ']', '\0', '[', ']'}; if (mtx.cols == 1) braces[0] = braces[1] = '\0'; - return cv::makePtr("[", "]", mtx, &*braces, + return makePtr("[", "]", mtx, &*braces, mtx.rows == 1 || !multiline, false, mtx.depth() == CV_64F ? prec64f : prec32f ); } }; @@ -320,7 +321,7 @@ namespace { public: - cv::Ptr format(const cv::Mat& mtx) const + Ptr format(const Mat& mtx) const { static const char* numpyTypes[] = { @@ -329,7 +330,7 @@ namespace char braces[5] = {'[', ']', '\0', '[', ']'}; if (mtx.cols == 1) braces[0] = braces[1] = '\0'; - return cv::makePtr("array([", + return makePtr("array([", cv::format("], type='%s')", numpyTypes[mtx.depth()]), mtx, &*braces, mtx.rows == 1 || !multiline, false, mtx.depth() == CV_64F ? prec64f : prec32f ); } @@ -339,11 +340,11 @@ namespace { public: - cv::Ptr format(const cv::Mat& mtx) const + Ptr format(const Mat& mtx) const { char braces[5] = {'\0', '\0', '\0', '\0', '\0'}; - return cv::makePtr(cv::String(), - mtx.rows > 1 ? cv::String("\n") : cv::String(), mtx, &*braces, + return makePtr(String(), + mtx.rows > 1 ? String("\n") : String(), mtx, &*braces, mtx.rows == 1 || !multiline, false, mtx.depth() == CV_64F ? prec64f : prec32f ); } }; @@ -352,19 +353,14 @@ namespace { public: - cv::Ptr format(const cv::Mat& mtx) const + Ptr format(const Mat& mtx) const { char braces[5] = {'\0', '\0', ',', '\0', '\0'}; - return cv::makePtr("{", "}", mtx, &*braces, + return makePtr("{", "}", mtx, &*braces, mtx.rows == 1 || !multiline, false, mtx.depth() == CV_64F ? prec64f : prec32f ); } }; -} // namespace - - -namespace cv -{ Formatted::~Formatted() {} Formatter::~Formatter() {} From 723c317108a4a1e36f2dc098dab0cf111d46f2c2 Mon Sep 17 00:00:00 2001 From: berak Date: Tue, 14 Apr 2015 19:01:48 +0200 Subject: [PATCH 11/48] changed create function in tutorial --- doc/py_tutorials/py_calib3d/py_depthmap/py_depthmap.markdown | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/py_tutorials/py_calib3d/py_depthmap/py_depthmap.markdown b/doc/py_tutorials/py_calib3d/py_depthmap/py_depthmap.markdown index 5ef338015..7d9a1258a 100644 --- a/doc/py_tutorials/py_calib3d/py_depthmap/py_depthmap.markdown +++ b/doc/py_tutorials/py_calib3d/py_depthmap/py_depthmap.markdown @@ -44,7 +44,7 @@ from matplotlib import pyplot as plt imgL = cv2.imread('tsukuba_l.png',0) imgR = cv2.imread('tsukuba_r.png',0) -stereo = cv2.createStereoBM(numDisparities=16, blockSize=15) +stereo = cv2.StereoBM_create(numDisparities=16, blockSize=15) disparity = stereo.compute(imgL,imgR) plt.imshow(disparity,'gray') plt.show() From a2bba1b9e6771b8b27906bf9e3fcea6f2cd3bd2c Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Wed, 15 Apr 2015 00:36:27 +0300 Subject: [PATCH 12/48] HAL extensions: added initial version of universal intrinsics (C implementation and SSE2, NEON - TBD). 
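The tutorial change in PATCH 11 reflects OpenCV 3's factory convention: algorithm instances come from static create() methods (cv2.StereoBM_create in Python) rather than the removed createStereoBM-style helpers. The equivalent C++ usage, with the same tsukuba images as the tutorial:

#include <opencv2/calib3d.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    cv::Mat imgL = cv::imread("tsukuba_l.png", cv::IMREAD_GRAYSCALE);
    cv::Mat imgR = cv::imread("tsukuba_r.png", cv::IMREAD_GRAYSCALE);

    // OpenCV 3 style: StereoBM::create(numDisparities, blockSize)
    cv::Ptr<cv::StereoBM> stereo = cv::StereoBM::create(16, 15);
    cv::Mat disparity;
    stereo->compute(imgL, imgR, disparity); // fixed-point (CV_16S) disparity map
    return 0;
}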
added empty files where some functionality from core and imgproc will be moved to --- modules/core/include/opencv2/core/base.hpp | 131 - modules/core/include/opencv2/core/cvdef.h | 181 -- modules/core/include/opencv2/core/private.hpp | 8 - modules/core/include/opencv2/core/types_c.h | 16 - modules/hal/include/opencv2/hal.hpp | 45 +- modules/hal/include/opencv2/hal/defs.h | 391 ++- modules/hal/include/opencv2/hal/intrin.hpp | 2254 +++++++++++++++++ modules/hal/src/arithm.cpp | 47 + modules/hal/src/color.cpp | 47 + modules/hal/src/filter.cpp | 47 + modules/hal/src/mathfuncs.cpp | 47 + modules/hal/src/matrix.cpp | 47 + modules/hal/src/precomp.hpp | 42 + modules/hal/src/resize.cpp | 47 + modules/hal/src/{norm.cpp => stat.cpp} | 0 modules/hal/src/warp.cpp | 47 + 16 files changed, 3055 insertions(+), 342 deletions(-) create mode 100644 modules/hal/include/opencv2/hal/intrin.hpp create mode 100644 modules/hal/src/arithm.cpp create mode 100644 modules/hal/src/color.cpp create mode 100644 modules/hal/src/filter.cpp create mode 100644 modules/hal/src/mathfuncs.cpp create mode 100644 modules/hal/src/matrix.cpp create mode 100644 modules/hal/src/resize.cpp rename modules/hal/src/{norm.cpp => stat.cpp} (100%) create mode 100644 modules/hal/src/warp.cpp diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp index 08a6642fb..83661a2fd 100644 --- a/modules/core/include/opencv2/core/base.hpp +++ b/modules/core/include/opencv2/core/base.hpp @@ -400,136 +400,6 @@ configurations while CV_DbgAssert is only retained in the Debug configuration. # define CV_DbgAssert(expr) #endif - -/////////////// saturate_cast (used in image & signal processing) /////////////////// - -/** -Template function for accurate conversion from one primitive type to another. - -The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\() -and others. They perform an efficient and accurate conversion from one primitive type to another -(see the introduction chapter). saturate in the name means that when the input value v is out of the -range of the target type, the result is not formed just by taking low bits of the input, but instead -the value is clipped. For example: -@code - uchar a = saturate_cast(-100); // a = 0 (UCHAR_MIN) - short b = saturate_cast(33333.33333); // b = 32767 (SHRT_MAX) -@endcode -Such clipping is done when the target type is unsigned char , signed char , unsigned short or -signed short . For 32-bit integers, no clipping is done. - -When the parameter is a floating-point value and the target type is an integer (8-, 16- or 32-bit), -the floating-point value is first rounded to the nearest integer and then clipped if needed (when -the target type is 8- or 16-bit). - -This operation is used in the simplest or most complex image processing functions in OpenCV. - -@param v Function parameter. 
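The saturate_cast documentation being moved out of base.hpp here is easiest to read alongside concrete values; a small check program (the values follow the documented behavior):

#include <opencv2/core.hpp>
#include <cstdio>

int main()
{
    uchar a = cv::saturate_cast<uchar>(-100);        // 0 (UCHAR_MIN), clipped
    short b = cv::saturate_cast<short>(33333.33333); // 32767 (SHRT_MAX), rounded then clipped
    uchar c = cv::saturate_cast<uchar>(257);         // 255, not the low-bits result 1
    std::printf("%d %d %d\n", (int)a, (int)b, (int)c);
    return 0;
}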
-@sa add, subtract, multiply, divide, Mat::convertTo -*/ -template static inline _Tp saturate_cast(uchar v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(schar v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(ushort v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(short v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(unsigned v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(int v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(float v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(double v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(int64 v) { return _Tp(v); } -/** @overload */ -template static inline _Tp saturate_cast(uint64 v) { return _Tp(v); } - -//! @cond IGNORED - -template<> inline uchar saturate_cast(schar v) { return (uchar)std::max((int)v, 0); } -template<> inline uchar saturate_cast(ushort v) { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); } -template<> inline uchar saturate_cast(int v) { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } -template<> inline uchar saturate_cast(short v) { return saturate_cast((int)v); } -template<> inline uchar saturate_cast(unsigned v) { return (uchar)std::min(v, (unsigned)UCHAR_MAX); } -template<> inline uchar saturate_cast(float v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline uchar saturate_cast(double v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline uchar saturate_cast(int64 v) { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } -template<> inline uchar saturate_cast(uint64 v) { return (uchar)std::min(v, (uint64)UCHAR_MAX); } - -template<> inline schar saturate_cast(uchar v) { return (schar)std::min((int)v, SCHAR_MAX); } -template<> inline schar saturate_cast(ushort v) { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); } -template<> inline schar saturate_cast(int v) { return (schar)((unsigned)(v-SCHAR_MIN) <= (unsigned)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); } -template<> inline schar saturate_cast(short v) { return saturate_cast((int)v); } -template<> inline schar saturate_cast(unsigned v) { return (schar)std::min(v, (unsigned)SCHAR_MAX); } -template<> inline schar saturate_cast(float v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline schar saturate_cast(double v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline schar saturate_cast(int64 v) { return (schar)((uint64)((int64)v-SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); } -template<> inline schar saturate_cast(uint64 v) { return (schar)std::min(v, (uint64)SCHAR_MAX); } - -template<> inline ushort saturate_cast(schar v) { return (ushort)std::max((int)v, 0); } -template<> inline ushort saturate_cast(short v) { return (ushort)std::max((int)v, 0); } -template<> inline ushort saturate_cast(int v) { return (ushort)((unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? 
USHRT_MAX : 0); } -template<> inline ushort saturate_cast(unsigned v) { return (ushort)std::min(v, (unsigned)USHRT_MAX); } -template<> inline ushort saturate_cast(float v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline ushort saturate_cast(double v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline ushort saturate_cast(int64 v) { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); } -template<> inline ushort saturate_cast(uint64 v) { return (ushort)std::min(v, (uint64)USHRT_MAX); } - -template<> inline short saturate_cast(ushort v) { return (short)std::min((int)v, SHRT_MAX); } -template<> inline short saturate_cast(int v) { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } -template<> inline short saturate_cast(unsigned v) { return (short)std::min(v, (unsigned)SHRT_MAX); } -template<> inline short saturate_cast(float v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline short saturate_cast(double v) { int iv = cvRound(v); return saturate_cast(iv); } -template<> inline short saturate_cast(int64 v) { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } -template<> inline short saturate_cast(uint64 v) { return (short)std::min(v, (uint64)SHRT_MAX); } - -template<> inline int saturate_cast(float v) { return cvRound(v); } -template<> inline int saturate_cast(double v) { return cvRound(v); } - -// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc. -template<> inline unsigned saturate_cast(float v) { return cvRound(v); } -template<> inline unsigned saturate_cast(double v) { return cvRound(v); } - -//! @endcond - -//////////////////////////////// low-level functions //////////////////////////////// - -CV_EXPORTS int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n); -CV_EXPORTS int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n); -CV_EXPORTS bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n); -CV_EXPORTS bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n); - -CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n); -CV_EXPORTS float normL1_(const float* a, const float* b, int n); -CV_EXPORTS float normL2Sqr_(const float* a, const float* b, int n); - -CV_EXPORTS void exp(const float* src, float* dst, int n); -CV_EXPORTS void log(const float* src, float* dst, int n); - -CV_EXPORTS void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees); -CV_EXPORTS void magnitude(const float* x, const float* y, float* dst, int n); - -/** @brief Computes the cube root of an argument. - -The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly. -NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for -single-precision data. -@param val A function argument. - */ -CV_EXPORTS_W float cubeRoot(float val); - -/** @brief Calculates the angle of a 2D vector in degrees. - -The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured -in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees. -@param x x-coordinate of the vector. -@param y y-coordinate of the vector. 
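In practice the two convenience functions documented above behave as follows (plain usage of the public cv:: API):

#include <opencv2/core.hpp>
#include <cstdio>

int main()
{
    // cubeRoot handles negative input correctly, unlike pow(x, 1.0/3.0)
    std::printf("%g\n", cv::cubeRoot(-27.0f));       // -3
    // fastAtan2 returns a full-range angle in degrees, accurate to ~0.3 deg
    std::printf("%g\n", cv::fastAtan2(1.0f, 1.0f));  // ~45
    return 0;
}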
- */ -CV_EXPORTS_W float fastAtan2(float y, float x); - /* * Hamming distance functor - counts the bit differences between two strings - useful for the Brief descriptor * bit count of A exclusive XOR'ed with B @@ -549,7 +419,6 @@ typedef Hamming HammingLUT; /////////////////////////////////// inline norms //////////////////////////////////// - template static inline _AccTp normL2Sqr(const _Tp* a, int n) { diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index af0a271a4..1d933b5c3 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -70,16 +70,6 @@ # define CV_EXPORTS #endif -#ifndef CV_INLINE -# if defined __cplusplus -# define CV_INLINE static inline -# elif defined _MSC_VER -# define CV_INLINE __inline -# else -# define CV_INLINE static -# endif -#endif - #ifndef CV_EXTERN_C # ifdef __cplusplus # define CV_EXTERN_C extern "C" @@ -186,19 +176,6 @@ #define CV_ELEM_SIZE(type) \ (CV_MAT_CN(type) << ((((sizeof(size_t)/4+1)*16384|0x3a50) >> CV_MAT_DEPTH(type)*2) & 3)) - -/****************************************************************************************\ -* fast math * -\****************************************************************************************/ - -#if defined __BORLANDC__ -# include -#elif defined __cplusplus -# include -#else -# include -#endif - #ifndef MIN # define MIN(a,b) ((a) > (b) ? (b) : (a)) #endif @@ -207,164 +184,6 @@ # define MAX(a,b) ((a) < (b) ? (b) : (a)) #endif -#ifdef HAVE_TEGRA_OPTIMIZATION -# include "tegra_round.hpp" -#endif - -//! @addtogroup core_utils -//! @{ - -#if CV_VFP -// 1. general scheme -#define ARM_ROUND(_value, _asm_string) \ - int res; \ - float temp; \ - asm(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \ - return res; -// 2. version for double -#ifdef __clang__ -#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]") -#else -#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]") -#endif -// 3. version for float -#define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]") -#endif // CV_VFP - -/** @brief Rounds floating-point number to the nearest integer - -@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the -result is not defined. - */ -CV_INLINE int cvRound( double value ) -{ -#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__) - __m128d t = _mm_set_sd( value ); - return _mm_cvtsd_si32(t); -#elif defined _MSC_VER && defined _M_IX86 - int t; - __asm - { - fld value; - fistp t; - } - return t; -#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION - TEGRA_ROUND_DBL(value); -#elif defined CV_ICC || defined __GNUC__ -# if CV_VFP - ARM_ROUND_DBL(value) -# else - return (int)lrint(value); -# endif -#else - double intpart, fractpart; - fractpart = modf(value, &intpart); - if ((fabs(fractpart) != 0.5) || ((((int)intpart) % 2) != 0)) - return (int)(value + (value >= 0 ? 
0.5 : -0.5)); - else - return (int)intpart; -#endif -} - -#ifdef __cplusplus - -/** @overload */ -CV_INLINE int cvRound(float value) -{ -#if defined ANDROID && (defined CV_ICC || defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION - TEGRA_ROUND_FLT(value); -#elif CV_VFP && !defined HAVE_TEGRA_OPTIMIZATION - ARM_ROUND_FLT(value) -#else - return cvRound((double)value); -#endif -} - -/** @overload */ -CV_INLINE int cvRound(int value) -{ - return value; -} - -#endif // __cplusplus - -/** @brief Rounds floating-point number to the nearest integer not larger than the original. - -The function computes an integer i such that: -\f[i \le \texttt{value} < i+1\f] -@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the -result is not defined. - */ -CV_INLINE int cvFloor( double value ) -{ -#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__) - __m128d t = _mm_set_sd( value ); - int i = _mm_cvtsd_si32(t); - return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t,i))); -#elif defined __GNUC__ - int i = (int)value; - return i - (i > value); -#else - int i = cvRound(value); - float diff = (float)(value - i); - return i - (diff < 0); -#endif -} - -/** @brief Rounds floating-point number to the nearest integer not larger than the original. - -The function computes an integer i such that: -\f[i \le \texttt{value} < i+1\f] -@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the -result is not defined. -*/ -CV_INLINE int cvCeil( double value ) -{ -#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__) - __m128d t = _mm_set_sd( value ); - int i = _mm_cvtsd_si32(t); - return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t)); -#elif defined __GNUC__ - int i = (int)value; - return i + (i < value); -#else - int i = cvRound(value); - float diff = (float)(i - value); - return i + (diff < 0); -#endif -} - -/** @brief Determines if the argument is Not A Number. - -@param value The input floating-point value - -The function returns 1 if the argument is Not A Number (as defined by IEEE754 standard), 0 -otherwise. */ -CV_INLINE int cvIsNaN( double value ) -{ - union { uint64 u; double f; } ieee754; - ieee754.f = value; - return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) + - ((unsigned)ieee754.u != 0) > 0x7ff00000; -} - -/** @brief Determines if the argument is Infinity. - -@param value The input floating-point value - -The function returns 1 if the argument is a plus or minus infinity (as defined by IEEE754 standard) -and 0 otherwise. */ -CV_INLINE int cvIsInf( double value ) -{ - union { uint64 u; double f; } ieee754; - ieee754.f = value; - return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 && - (unsigned)ieee754.u == 0; -} - -//! 
@} core_utils - /****************************************************************************************\ * exchange-add operation for atomic operations on reference counters * \****************************************************************************************/ diff --git a/modules/core/include/opencv2/core/private.hpp b/modules/core/include/opencv2/core/private.hpp index 58d78e584..4f9f48777 100644 --- a/modules/core/include/opencv2/core/private.hpp +++ b/modules/core/include/opencv2/core/private.hpp @@ -136,14 +136,6 @@ namespace cv /* the alignment of all the allocated buffers */ #define CV_MALLOC_ALIGN 16 -#ifdef __GNUC__ -# define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x))) -#elif defined _MSC_VER -# define CV_DECL_ALIGNED(x) __declspec(align(x)) -#else -# define CV_DECL_ALIGNED(x) -#endif - /* IEEE754 constants and macros */ #define CV_TOGGLE_FLT(x) ((x)^((int)(x) < 0 ? 0x7fffffff : 0)) #define CV_TOGGLE_DBL(x) ((x)^((int64)(x) < 0 ? CV_BIG_INT(0x7fffffffffffffff) : 0)) diff --git a/modules/core/include/opencv2/core/types_c.h b/modules/core/include/opencv2/core/types_c.h index 16e613053..cb39587a9 100644 --- a/modules/core/include/opencv2/core/types_c.h +++ b/modules/core/include/opencv2/core/types_c.h @@ -113,22 +113,6 @@ bytes of the header. In C++ interface the role of CvArr is played by InputArray */ typedef void CvArr; -typedef union Cv32suf -{ - int i; - unsigned u; - float f; -} -Cv32suf; - -typedef union Cv64suf -{ - int64 i; - uint64 u; - double f; -} -Cv64suf; - typedef int CVStatus; /** @see cv::Error::Code */ diff --git a/modules/hal/include/opencv2/hal.hpp b/modules/hal/include/opencv2/hal.hpp index d0e135080..7641c46ca 100644 --- a/modules/hal/include/opencv2/hal.hpp +++ b/modules/hal/include/opencv2/hal.hpp @@ -55,7 +55,7 @@ namespace cv { namespace hal { namespace Error { -enum Code +enum { Ok = 0, Unknown = -1 @@ -63,11 +63,46 @@ enum Code } -Error::Code normHamming(const uchar* a, int n, int & result); -Error::Code normHamming(const uchar* a, const uchar* b, int n, int & result); +int normHamming(const uchar* a, int n); +int normHamming(const uchar* a, const uchar* b, int n); -Error::Code normHamming(const uchar* a, int n, int cellSize, int & result); -Error::Code normHamming(const uchar* a, const uchar* b, int n, int cellSize, int & result); +int normHamming(const uchar* a, int n, int cellSize); +int normHamming(const uchar* a, const uchar* b, int n, int cellSize); + +//////////////////////////////// low-level functions //////////////////////////////// + +int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n); +int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n); +bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n); +bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n); + +int normL1_(const uchar* a, const uchar* b, int n); +float normL1_(const float* a, const float* b, int n); +float normL2Sqr_(const float* a, const float* b, int n); + +void exp(const float* src, float* dst, int n); +void log(const float* src, float* dst, int n); + +void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees); +void magnitude(const float* x, const float* y, float* dst, int n); + +/** @brief Computes the cube root of an argument. + + The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly. + NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for + single-precision data. 
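The hal.hpp change earlier in this hunk simplifies normHamming to return the distance directly rather than an Error::Code plus out-parameter. As a reminder of what it computes, a scalar reference implementation (the real cv::hal version is optimized; this is only the definition):

// Hamming norm of two byte strings: the number of differing bits.
static int normHammingRef(const unsigned char* a, const unsigned char* b, int n)
{
    int result = 0;
    for (int i = 0; i < n; i++)
    {
        unsigned char v = (unsigned char)(a[i] ^ b[i]);
        for (; v != 0; v = (unsigned char)(v & (v - 1))) // clear the lowest set bit
            result++;
    }
    return result;
}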
+ @param val A function argument. + */ +float cubeRoot(float val); + +/** @brief Calculates the angle of a 2D vector in degrees. + + The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured + in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees. + @param x x-coordinate of the vector. + @param y y-coordinate of the vector. + */ +float fastAtan2(float y, float x); }} //cv::hal diff --git a/modules/hal/include/opencv2/hal/defs.h b/modules/hal/include/opencv2/hal/defs.h index 96595961f..6e1ff2a0a 100644 --- a/modules/hal/include/opencv2/hal/defs.h +++ b/modules/hal/include/opencv2/hal/defs.h @@ -1,3 +1,4 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// // // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. // @@ -48,6 +49,8 @@ # define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */ #endif +#include + #if defined __ICL # define CV_ICC __ICL #elif defined __ICC @@ -60,12 +63,30 @@ # define CV_ICC __INTEL_COMPILER #endif +#ifndef CV_INLINE +# if defined __cplusplus +# define CV_INLINE static inline +# elif defined _MSC_VER +# define CV_INLINE __inline +# else +# define CV_INLINE static +# endif +#endif + #if defined CV_ICC && !defined CV_ENABLE_UNROLLED # define CV_ENABLE_UNROLLED 0 #else # define CV_ENABLE_UNROLLED 1 #endif +#ifdef __GNUC__ +# define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x))) +#elif defined _MSC_VER +# define CV_DECL_ALIGNED(x) __declspec(align(x)) +#else +# define CV_DECL_ALIGNED(x) +#endif + /* CPU features and intrinsics support */ #define CV_CPU_NONE 0 #define CV_CPU_MMX 1 @@ -99,7 +120,7 @@ // do not include SSE/AVX/NEON headers for NVCC compiler #ifndef __CUDACC__ -#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) +#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2) # include # define CV_MMX 1 # define CV_SSE 1 @@ -281,4 +302,372 @@ typedef signed char schar; #define CV_2PI 6.283185307179586476925286766559 #define CV_LOG2 0.69314718055994530941723212145818 +typedef union Cv32suf +{ + int i; + unsigned u; + float f; +} +Cv32suf; + +typedef union Cv64suf +{ + int64 i; + uint64 u; + double f; +} +Cv64suf; + + +/****************************************************************************************\ +* fast math * +\****************************************************************************************/ + +#if defined __BORLANDC__ +# include +#elif defined __cplusplus +# include +#else +# include +#endif + +#ifdef HAVE_TEGRA_OPTIMIZATION +# include "tegra_round.hpp" +#endif + +//! @addtogroup core_utils +//! @{ + +#if CV_VFP + // 1. general scheme + #define ARM_ROUND(_value, _asm_string) \ + int res; \ + float temp; \ + asm(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \ + return res + // 2. version for double + #ifdef __clang__ + #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]") + #else + #define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]") + #endif + // 3. version for float + #define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]") +#endif // CV_VFP + +/** @brief Rounds floating-point number to the nearest integer + + @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the + result is not defined. 
+ */ +CV_INLINE int +cvRound( double value ) +{ +#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \ + && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__) + __m128d t = _mm_set_sd( value ); + return _mm_cvtsd_si32(t); +#elif defined _MSC_VER && defined _M_IX86 + int t; + __asm + { + fld value; + fistp t; + } + return t; +#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \ + defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION + TEGRA_ROUND_DBL(value); +#elif defined CV_ICC || defined __GNUC__ +# if CV_VFP + ARM_ROUND_DBL(value) +# else + return (int)lrint(value); +# endif +#else + /* it's ok if round does not comply with IEEE754 standard; + the tests should allow +/-1 difference when the tested functions use round */ + return (int)(value + (value >= 0 ? 0.5 : -0.5)); +#endif +} + + +/** @brief Rounds floating-point number to the nearest integer not larger than the original. + + The function computes an integer i such that: + \f[i \le \texttt{value} < i+1\f] + @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the + result is not defined. + */ +CV_INLINE int cvFloor( double value ) +{ +#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__) + __m128d t = _mm_set_sd( value ); + int i = _mm_cvtsd_si32(t); + return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t,i))); +#elif defined __GNUC__ + int i = (int)value; + return i - (i > value); +#else + int i = cvRound(value); + float diff = (float)(value - i); + return i - (diff < 0); +#endif +} + +/** @brief Rounds floating-point number to the nearest integer not larger than the original. + + The function computes an integer i such that: + \f[i \le \texttt{value} < i+1\f] + @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the + result is not defined. + */ +CV_INLINE int cvCeil( double value ) +{ +#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__) + __m128d t = _mm_set_sd( value ); + int i = _mm_cvtsd_si32(t); + return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t)); +#elif defined __GNUC__ + int i = (int)value; + return i + (i < value); +#else + int i = cvRound(value); + float diff = (float)(i - value); + return i + (diff < 0); +#endif +} + +/** @brief Determines if the argument is Not A Number. + + @param value The input floating-point value + + The function returns 1 if the argument is Not A Number (as defined by IEEE754 standard), 0 + otherwise. */ +CV_INLINE int cvIsNaN( double value ) +{ + Cv64suf ieee754; + ieee754.f = value; + return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) + + ((unsigned)ieee754.u != 0) > 0x7ff00000; +} + +/** @brief Determines if the argument is Infinity. + + @param value The input floating-point value + + The function returns 1 if the argument is a plus or minus infinity (as defined by IEEE754 standard) + and 0 otherwise. 
*/ +CV_INLINE int cvIsInf( double value ) +{ + Cv64suf ieee754; + ieee754.f = value; + return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 && + (unsigned)ieee754.u == 0; +} + +#ifdef __cplusplus + +/** @overload */ +CV_INLINE int cvRound(float value) +{ +#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && \ + defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__) + __m128 t = _mm_set_ss( value ); + return _mm_cvtss_si32(t); +#elif defined _MSC_VER && defined _M_IX86 + int t; + __asm + { + fld value; + fistp t; + } + return t; +#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \ + defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION + TEGRA_ROUND_FLT(value); +#elif defined CV_ICC || defined __GNUC__ +# if CV_VFP + ARM_ROUND_FLT(value) +# else + return (int)lrintf(value); +# endif +#else + /* it's ok if round does not comply with IEEE754 standard; + the tests should allow +/-1 difference when the tested functions use round */ + return (int)(value + (value >= 0 ? 0.5f : -0.5f)); +#endif +} + +/** @overload */ +CV_INLINE int cvRound( int value ) +{ + return value; +} + +/** @overload */ +CV_INLINE int cvFloor( float value ) +{ +#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__) + __m128 t = _mm_set_ss( value ); + int i = _mm_cvtss_si32(t); + return i - _mm_movemask_ps(_mm_cmplt_ss(t, _mm_cvtsi32_ss(t,i))); +#elif defined __GNUC__ + int i = (int)value; + return i - (i > value); +#else + int i = cvRound(value); + float diff = (float)(value - i); + return i - (diff < 0); +#endif +} + +/** @overload */ +CV_INLINE int cvFloor( int value ) +{ + return value; +} + +/** @overload */ +CV_INLINE int cvCeil( float value ) +{ +#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__) + __m128 t = _mm_set_ss( value ); + int i = _mm_cvtss_si32(t); + return i + _mm_movemask_ps(_mm_cmplt_ss(_mm_cvtsi32_ss(t,i), t)); +#elif defined __GNUC__ + int i = (int)value; + return i + (i < value); +#else + int i = cvRound(value); + float diff = (float)(i - value); + return i + (diff < 0); +#endif +} + +/** @overload */ +CV_INLINE int cvCeil( int value ) +{ + return value; +} + +/** @overload */ +CV_INLINE int cvIsNaN( float value ) +{ + Cv32suf ieee754; + ieee754.f = value; + return (ieee754.u & 0x7fffffff) > 0x7f800000; +} + +/** @overload */ +CV_INLINE int cvIsInf( float value ) +{ + Cv32suf ieee754; + ieee754.f = value; + return (ieee754.u & 0x7fffffff) == 0x7f800000; +} + +#include + +namespace cv +{ + +/////////////// saturate_cast (used in image & signal processing) /////////////////// + +/** + Template function for accurate conversion from one primitive type to another. + + The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\() + and others. They perform an efficient and accurate conversion from one primitive type to another + (see the introduction chapter). saturate in the name means that when the input value v is out of the + range of the target type, the result is not formed just by taking low bits of the input, but instead + the value is clipped. For example: + @code + uchar a = saturate_cast(-100); // a = 0 (UCHAR_MIN) + short b = saturate_cast(33333.33333); // b = 32767 (SHRT_MAX) + @endcode + Such clipping is done when the target type is unsigned char , signed char , unsigned short or + signed short . For 32-bit integers, no clipping is done. 
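A quick check of the rounding helpers (cvRound, cvFloor, cvCeil) defined earlier in this header; the include path follows this patch's layout and is an assumption:

#include <opencv2/hal/defs.h> // assumed path, per modules/hal/include in this patch
#include <cstdio>

int main()
{
    std::printf("%d %d\n", cvFloor(-1.5f), cvCeil(-1.5f)); // -2 -1
    std::printf("%d %d\n", cvFloor(2.0f), cvCeil(2.0f));   // 2 2: exact integers map to themselves
    std::printf("%d\n", cvRound(3.7));                     // 4 (nearest; tie handling is platform-dependent)
    return 0;
}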
+ + When the parameter is a floating-point value and the target type is an integer (8-, 16- or 32-bit), + the floating-point value is first rounded to the nearest integer and then clipped if needed (when + the target type is 8- or 16-bit). + + This operation is used in the simplest or most complex image processing functions in OpenCV. + + @param v Function parameter. + @sa add, subtract, multiply, divide, Mat::convertTo + */ +template static inline _Tp saturate_cast(uchar v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(schar v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(ushort v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(short v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(unsigned v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(int v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(float v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(double v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(int64 v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(uint64 v) { return _Tp(v); } + +//! @cond IGNORED + +template<> inline uchar saturate_cast(schar v) { return (uchar)std::max((int)v, 0); } +template<> inline uchar saturate_cast(ushort v) { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); } +template<> inline uchar saturate_cast(int v) { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } +template<> inline uchar saturate_cast(short v) { return saturate_cast((int)v); } +template<> inline uchar saturate_cast(unsigned v) { return (uchar)std::min(v, (unsigned)UCHAR_MAX); } +template<> inline uchar saturate_cast(float v) { int iv = cvRound(v); return saturate_cast(iv); } +template<> inline uchar saturate_cast(double v) { int iv = cvRound(v); return saturate_cast(iv); } +template<> inline uchar saturate_cast(int64 v) { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } +template<> inline uchar saturate_cast(uint64 v) { return (uchar)std::min(v, (uint64)UCHAR_MAX); } + +template<> inline schar saturate_cast(uchar v) { return (schar)std::min((int)v, SCHAR_MAX); } +template<> inline schar saturate_cast(ushort v) { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); } +template<> inline schar saturate_cast(int v) { return (schar)((unsigned)(v-SCHAR_MIN) <= (unsigned)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); } +template<> inline schar saturate_cast(short v) { return saturate_cast((int)v); } +template<> inline schar saturate_cast(unsigned v) { return (schar)std::min(v, (unsigned)SCHAR_MAX); } +template<> inline schar saturate_cast(float v) { int iv = cvRound(v); return saturate_cast(iv); } +template<> inline schar saturate_cast(double v) { int iv = cvRound(v); return saturate_cast(iv); } +template<> inline schar saturate_cast(int64 v) { return (schar)((uint64)((int64)v-SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); } +template<> inline schar saturate_cast(uint64 v) { return (schar)std::min(v, (uint64)SCHAR_MAX); } + +template<> inline ushort saturate_cast(schar v) { return (ushort)std::max((int)v, 0); } +template<> inline ushort saturate_cast(short v) { return (ushort)std::max((int)v, 0); } +template<> inline ushort saturate_cast(int v) { return (ushort)((unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? 
USHRT_MAX : 0); } +template<> inline ushort saturate_cast(unsigned v) { return (ushort)std::min(v, (unsigned)USHRT_MAX); } +template<> inline ushort saturate_cast(float v) { int iv = cvRound(v); return saturate_cast(iv); } +template<> inline ushort saturate_cast(double v) { int iv = cvRound(v); return saturate_cast(iv); } +template<> inline ushort saturate_cast(int64 v) { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); } +template<> inline ushort saturate_cast(uint64 v) { return (ushort)std::min(v, (uint64)USHRT_MAX); } + +template<> inline short saturate_cast(ushort v) { return (short)std::min((int)v, SHRT_MAX); } +template<> inline short saturate_cast(int v) { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } +template<> inline short saturate_cast(unsigned v) { return (short)std::min(v, (unsigned)SHRT_MAX); } +template<> inline short saturate_cast(float v) { int iv = cvRound(v); return saturate_cast(iv); } +template<> inline short saturate_cast(double v) { int iv = cvRound(v); return saturate_cast(iv); } +template<> inline short saturate_cast(int64 v) { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } +template<> inline short saturate_cast(uint64 v) { return (short)std::min(v, (uint64)SHRT_MAX); } + +template<> inline int saturate_cast(float v) { return cvRound(v); } +template<> inline int saturate_cast(double v) { return cvRound(v); } + +// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc. +template<> inline unsigned saturate_cast(float v) { return cvRound(v); } +template<> inline unsigned saturate_cast(double v) { return cvRound(v); } + +//! @endcond + +} + +#endif // __cplusplus + #endif //__OPENCV_HAL_H__ diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp new file mode 100644 index 000000000..b7b147a19 --- /dev/null +++ b/modules/hal/include/opencv2/hal/intrin.hpp @@ -0,0 +1,2254 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_HAL_INTRIN_HPP__ +#define __OPENCV_HAL_INTRIN_HPP__ + +#include +#include +#include + +#define OPENCV_HAL_ADD(a, b) ((a) + (b)) +#define OPENCV_HAL_AND(a, b) ((a) & (b)) +#define OPENCV_HAL_NOP(a) (a) +#define OPENCV_HAL_1ST(a, b) (a) + +namespace cv { namespace hal { + +template struct TypeTraits +{ + typedef _Tp int_type; + typedef _Tp uint_type; + typedef _Tp abs_type; + typedef _Tp sum_type; + + enum { delta = 0, shift = 0 }; + + static int_type reinterpret_int(_Tp x) { return x; } + static uint_type reinterpet_uint(_Tp x) { return x; } + static _Tp reinterpret_from_int(int_type x) { return (_Tp)x; } +}; + +template<> struct TypeTraits +{ + typedef uchar value_type; + typedef schar int_type; + typedef uchar uint_type; + typedef uchar abs_type; + typedef int sum_type; + + typedef ushort w_type; + + enum { delta = 128, shift = 8 }; + + static int_type reinterpret_int(value_type x) { return (int_type)x; } + static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } + static value_type reinterpret_from_int(int_type x) { return (value_type)x; } +}; + +template<> struct TypeTraits +{ + typedef schar value_type; + typedef schar int_type; + typedef uchar uint_type; + typedef uchar abs_type; + typedef int sum_type; + + typedef short w_type; + + enum { delta = 128, shift = 8 }; + + static int_type reinterpret_int(value_type x) { return (int_type)x; } + static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } + static value_type reinterpret_from_int(int_type x) { return (value_type)x; } +}; + +template<> struct TypeTraits +{ + typedef ushort value_type; + typedef short int_type; + typedef ushort uint_type; + typedef ushort abs_type; + typedef int sum_type; + + typedef unsigned w_type; + typedef uchar nu_type; + + enum { delta = 32768, shift = 16 }; + + static int_type reinterpret_int(value_type x) { return (int_type)x; } + static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } + static value_type reinterpret_from_int(int_type x) { return (value_type)x; } +}; + +template<> struct TypeTraits +{ + typedef short value_type; + typedef short int_type; + typedef ushort uint_type; + typedef ushort abs_type; + typedef int sum_type; + + typedef int w_type; + typedef uchar nu_type; + typedef schar n_type; + + enum { delta = 128, shift = 8 }; + + static int_type reinterpret_int(value_type x) { return (int_type)x; } + static uint_type reinterpret_uint(value_type x) { return (uint_type)x; } + static value_type reinterpret_from_int(int_type x) { return (value_type)x; } +}; + +template<> struct TypeTraits +{ + typedef unsigned value_type; + typedef int int_type; + typedef unsigned uint_type; + typedef unsigned abs_type; + typedef unsigned sum_type; 
+
+template<> struct TypeTraits<int>
+{
+    typedef int value_type;
+    typedef int int_type;
+    typedef unsigned uint_type;
+    typedef unsigned abs_type;
+    typedef int sum_type;
+
+    typedef short n_type;
+    typedef ushort nu_type;
+
+    static int_type reinterpret_int(value_type x) { return (int_type)x; }
+    static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
+    static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
+};
+
+template<> struct TypeTraits<float>
+{
+    typedef float value_type;
+    typedef int int_type;
+    typedef unsigned uint_type;
+    typedef float abs_type;
+    typedef float sum_type;
+
+    typedef double w_type;
+
+    static int_type reinterpret_int(value_type x)
+    {
+        Cv32suf u;
+        u.f = x;
+        return u.i;
+    }
+    static uint_type reinterpret_uint(value_type x)
+    {
+        Cv32suf u;
+        u.f = x;
+        return u.u;
+    }
+    static value_type reinterpret_from_int(int_type x)
+    {
+        Cv32suf u;
+        u.i = x;
+        return u.f;
+    }
+};
+
+template<> struct TypeTraits<double>
+{
+    typedef double value_type;
+    typedef int64 int_type;
+    typedef uint64 uint_type;
+    typedef double abs_type;
+    typedef double sum_type;
+    static int_type reinterpret_int(value_type x)
+    {
+        Cv64suf u;
+        u.f = x;
+        return u.i;
+    }
+    static uint_type reinterpret_uint(value_type x)
+    {
+        Cv64suf u;
+        u.f = x;
+        return u.u;
+    }
+    static value_type reinterpret_from_int(int_type x)
+    {
+        Cv64suf u;
+        u.i = x;
+        return u.f;
+    }
+};
+
+template<typename _Tp, int n> struct v_reg
+{
+    typedef _Tp scalar_type;
+    typedef v_reg<typename TypeTraits<_Tp>::int_type, n> int_vec;
+    typedef v_reg<typename TypeTraits<_Tp>::abs_type, n> abs_vec;
+    enum { channels = n };
+
+    explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
+    v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
+    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
+    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
+          _Tp s4, _Tp s5, _Tp s6, _Tp s7)
+    {
+        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
+        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
+    }
+    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
+          _Tp s4, _Tp s5, _Tp s6, _Tp s7,
+          _Tp s8, _Tp s9, _Tp s10, _Tp s11,
+          _Tp s12, _Tp s13, _Tp s14, _Tp s15)
+    {
+        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
+        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
+        s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
+        s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
+    }
+
+    v_reg() {}
+    v_reg(const v_reg<_Tp, n> & r)
+    {
+        for( int i = 0; i < n; i++ )
+            s[i] = r.s[i];
+    }
+
+    _Tp get(const int i) const { return s[i]; }
+    _Tp get0() const { return s[0]; }
+    v_reg<_Tp, n> high() const
+    {
+        v_reg<_Tp, n> c;
+        int i;
+        for( i = 0; i < n/2; i++ )
+        {
+            c.s[i] = s[i+(n/2)];
+            c.s[i+(n/2)] = 0;
+        }
+        return c;
+    }
+
+    static v_reg<_Tp, n> zero()
+    {
+        v_reg<_Tp, n> c;
+        for( int i = 0; i < n; i++ )
+            c.s[i] = (_Tp)0;
+        return c;
+    }
+
+    static v_reg<_Tp, n> all(_Tp s)
+    {
+        v_reg<_Tp, n> c;
+        for( int i = 0; i < n; i++ )
+            c.s[i] = s;
+        return c;
+    }
+
+    template<typename _Tp2, int n2> static v_reg<_Tp2, n2> reinterpret_as(const v_reg<_Tp, n>& a)
+    {
+        size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
+        v_reg<_Tp2, n2> c;
+        memcpy(&c.s[0], &a.s[0], bytes);
+        return c;
+    }
+
+    _Tp s[n];
+};
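+
+// --- Editor's illustration (not part of the original patch) ---
+// v_reg is the portable fallback "register": n lanes of _Tp with plain
+// per-lane semantics. Constructing one and reading lanes back (hypothetical
+// snippet, in a comment):
+//
+//     float buf[4] = { 1.f, 2.f, 3.f, 4.f };
+//     v_reg<float, 4> a(buf);                 // load 4 lanes from memory
+//     v_reg<float, 4> b(4.f, 3.f, 2.f, 1.f);  // set lanes individually
+//     float s = a.get0() + b.get(3);          // 1 + 1 == 2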
+
+#define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
+template<typename _Tp, int n> inline v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
+    return c; \
+} \
+template<typename _Tp, int n> inline v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    for( int i = 0; i < n; i++ ) \
+        a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_BIN_OP(+)
+OPENCV_HAL_IMPL_BIN_OP(-)
+OPENCV_HAL_IMPL_BIN_OP(*)
+OPENCV_HAL_IMPL_BIN_OP(/)
+
+#define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
+template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tp, n> c; \
+    typedef typename TypeTraits<_Tp>::int_type itype; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = TypeTraits<_Tp>::reinterpret_from_int((itype)(TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
+                                                               TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
+    return c; \
+} \
+template<typename _Tp, int n> inline v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    typedef typename TypeTraits<_Tp>::int_type itype; \
+    for( int i = 0; i < n; i++ ) \
+        a.s[i] = TypeTraits<_Tp>::reinterpret_from_int((itype)(TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
+                                                               TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_BIT_OP(&)
+OPENCV_HAL_IMPL_BIT_OP(|)
+OPENCV_HAL_IMPL_BIT_OP(^)
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
+{
+    v_reg<_Tp, n> c;
+    typedef typename TypeTraits<_Tp>::int_type itype;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = TypeTraits<_Tp>::reinterpret_from_int(~TypeTraits<_Tp>::reinterpret_int(a.s[i]));
+    return c;
+}
+
+#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
+template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
+{ \
+    v_reg<_Tp2, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = cfunc(a.s[i]); \
+    return c; \
+}
+
+OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
+OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
+OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
+OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
+OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
+OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename TypeTraits<_Tp>::abs_type)std::abs, typename TypeTraits<_Tp>::abs_type)
+OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
+OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
+OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
+OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
+
+#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, hfunc, cfunc) \
+template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = cfunc(a.s[i], b.s[i]); \
+    return c; \
+} \
+template<typename _Tp, int n> inline _Tp hfunc(const v_reg<_Tp, n>& a) \
+{ \
+    _Tp c = a.s[0]; \
+    for( int i = 1; i < n; i++ ) \
+        c = cfunc(c, a.s[i]); \
+    return c; \
+}
+
+OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, v_reduce_min, std::min)
+OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, v_reduce_max, std::max)
+
+template<typename _Tp, int n> inline void v_minmax(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                                                   v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval)
+{
+    for( int i = 0; i < n; i++ )
+    {
+        minval.s[i] = std::min(a.s[i], b.s[i]);
+        maxval.s[i] = std::max(a.s[i], b.s[i]);
+    }
+}
+
+#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
+template<typename _Tp, int n> inline v_reg<_Tp, n> \
+    operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    typedef typename TypeTraits<_Tp>::int_type itype; \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
+    return c; \
+}
+
+OPENCV_HAL_IMPL_CMP_OP(<)
+OPENCV_HAL_IMPL_CMP_OP(>)
+OPENCV_HAL_IMPL_CMP_OP(<=)
+OPENCV_HAL_IMPL_CMP_OP(>=)
+OPENCV_HAL_IMPL_CMP_OP(==)
+OPENCV_HAL_IMPL_CMP_OP(!=)
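+
+// --- Editor's illustration (not part of the original patch) ---
+// The comparison operators above do not return bool: every lane of the
+// result is all-ones where the comparison holds and all-zeros where it does
+// not, re-expressed in the lane type via reinterpret_from_int. Such masks
+// are meant to be combined with the bitwise operators (and with v_select,
+// defined below). Sketch, in a comment:
+//
+//     v_reg<int, 4> a(1, 5, 3, 7), b(4, 4, 4, 4);
+//     v_reg<int, 4> m = a < b;   // lanes: -1, 0, -1, 0  (all bits set == true)
+//     v_reg<int, 4> k = a & m;   // keeps lanes where a < b: 1, 0, 3, 0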
+
+#define OPENCV_HAL_IMPL_ADDSUB_OP(func, bin_op, cast_op, _Tp2) \
+template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    typedef _Tp2 rtype; \
+    v_reg<rtype, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
+    return c; \
+}
+
+OPENCV_HAL_IMPL_ADDSUB_OP(v_add_wrap, +, (_Tp), _Tp)
+OPENCV_HAL_IMPL_ADDSUB_OP(v_sub_wrap, -, (_Tp), _Tp)
+OPENCV_HAL_IMPL_ADDSUB_OP(v_absdiff, -, (rtype)std::abs, typename TypeTraits<_Tp>::abs_type)
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
+    return c;
+}
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
+    return c;
+}
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                                                            const v_reg<_Tp, n>& c)
+{
+    v_reg<_Tp, n> d;
+    for( int i = 0; i < n; i++ )
+        d.s[i] = a.s[i]*b.s[i] + c.s[i];
+    return d;
+}
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_mullo(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = (_Tp)(a.s[i]*b.s[i]);
+    return c;
+}
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_mulhi2(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = (_Tp)((a.s[i]*b.s[i]*2 + TypeTraits<_Tp>::delta) >> TypeTraits<_Tp>::shift);
+    return c;
+}
+
+#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
+template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
+{ \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = (_Tp)(a.s[i] shift_op imm); \
+    return c; \
+}
+
+OPENCV_HAL_IMPL_SHIFT_OP(<<)
+OPENCV_HAL_IMPL_SHIFT_OP(>>)
+
+template<typename _Tp, int n> inline typename TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
+{
+    typename TypeTraits<_Tp>::sum_type c = a.s[0];
+    for( int i = 1; i < n; i++ )
+        c += a.s[i];
+    return c;
+}
+
+template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
+{
+    int mask = 0;
+    for( int i = 0; i < n; i++ )
+        mask |= (TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
+    return mask;
+}
+
+template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
+{
+    for( int i = 0; i < n; i++ )
+        if( TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
+            return false;
+    return true;
+}
+
+template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
+{
+    for( int i = 0; i < n; i++ )
+        if( TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
+            return true;
+    return false;
+}
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
+                                                            const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = TypeTraits<_Tp>::reinterpret_int(mask.s[i]) < 0 ? a.s[i] : b.s[i]; // mask set -> a, matching the SSE v_select below
+    return c;
+}
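+
+// --- Editor's illustration (not part of the original patch) ---
+// v_signmask packs the per-lane sign bits into an ordinary integer so that a
+// comparison result can be inspected by scalar code; v_check_any/v_check_all
+// test the same bits without building the mask. Sketch, in a comment:
+//
+//     v_reg<int, 4> a(-1, 2, -3, 4);
+//     int m = v_signmask(a);       // bits 0 and 2 set -> m == 5
+//     bool any = v_check_any(a);   // true, same as v_signmask(a) != 0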
+
+template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
+                                                   v_reg<typename TypeTraits<_Tp>::w_type, n/2>& b0,
+                                                   v_reg<typename TypeTraits<_Tp>::w_type, n/2>& b1)
+{
+    for( int i = 0; i < (n/2); i++ )
+    {
+        b0.s[i] = a.s[i];
+        b1.s[i] = a.s[i+(n/2)];
+    }
+}
+
+template<typename _Tp, int n> inline v_reg<typename TypeTraits<_Tp>::int_type, n>
+    v_reinterpret_int(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename TypeTraits<_Tp>::int_type, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = TypeTraits<_Tp>::reinterpret_int(a.s[i]);
+    return c;
+}
+
+template<typename _Tp, int n> inline v_reg<typename TypeTraits<_Tp>::uint_type, n>
+    v_reinterpret_uint(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename TypeTraits<_Tp>::uint_type, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
+    return c;
+}
+
+template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
+                                                 v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
+{
+    int i;
+    for( i = 0; i < n/2; i++ )
+    {
+        b0.s[i*2] = a0.s[i];
+        b0.s[i*2+1] = a1.s[i];
+    }
+    for( ; i < n; i++ )
+    {
+        b1.s[i*2-n] = a0.s[i];
+        b1.s[i*2-n+1] = a1.s[i];
+    }
+}
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_load(const _Tp* ptr)
+{
+    return v_reg<_Tp, n>(ptr);
+}
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_load_aligned(const _Tp* ptr)
+{
+    return v_reg<_Tp, n>(ptr);
+}
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
+{
+    v_reg<_Tp, n> c;
+    for( int i = 0; i < n/2; i++ )
+    {
+        c.s[i] = loptr[i];
+        c.s[i+n/2] = hiptr[i];
+    }
+    return c;
+}
+
+template<typename _Tp, int n> inline v_reg<typename TypeTraits<_Tp>::w_type, n> v_load_expand(const _Tp* ptr)
+{
+    typedef typename TypeTraits<_Tp>::w_type w_type;
+    v_reg<w_type, n> c;
+    for( int i = 0; i < n; i++ )
+    {
+        c.s[i] = ptr[i];
+    }
+    return c;
+}
+
+template<typename _Tp, int n> inline v_reg<typename TypeTraits<typename TypeTraits<_Tp>::w_type>::w_type, n>
+    v_load_expand_q(const _Tp* ptr)
+{
+    typedef typename TypeTraits<typename TypeTraits<_Tp>::w_type>::w_type w_type;
+    v_reg<w_type, n> c;
+    for( int i = 0; i < n; i++ )
+    {
+        c.s[i] = ptr[i];
+    }
+    return c;
+}
+
+template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
+                                                              v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
+{
+    int i, i3;
+    for( i = i3 = 0; i < n; i++, i3 += 3 )
+    {
+        a.s[i] = ptr[i3];
+        b.s[i] = ptr[i3+1];
+        c.s[i] = ptr[i3+2];
+    }
+}
+
+template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
+                                                              v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
+                                                              v_reg<_Tp, n>& d)
+{
+    int i, i4;
+    for( i = i4 = 0; i < n; i++, i4 += 4 )
+    {
+        a.s[i] = ptr[i4];
+        b.s[i] = ptr[i4+1];
+        c.s[i] = ptr[i4+2];
+        d.s[i] = ptr[i4+3];
+    }
+}
+
+template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
+                                                              const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
+{
+    int i, i3;
+    for( i = i3 = 0; i < n; i++, i3 += 3 )
+    {
+        ptr[i3] = a.s[i];
+        ptr[i3+1] = b.s[i];
+        ptr[i3+2] = c.s[i];
+    }
+}
+
+template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
+                                                              const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
+                                                              const v_reg<_Tp, n>& d)
+{
+    int i, i4;
+    for( i = i4 = 0; i < n; i++, i4 += 4 )
+    {
+        ptr[i4] = a.s[i];
+        ptr[i4+1] = b.s[i];
+        ptr[i4+2] = c.s[i];
+        ptr[i4+3] = d.s[i];
+    }
+}
+
+template<typename _Tp, int n> inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+    for( int i = 0; i < n; i++ )
+        ptr[i] = a.s[i];
+}
+
+template<typename _Tp, int n> inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+    for( int i = 0; i < (n/2); i++ )
+        ptr[i] = a.s[i];
+}
+
+template<typename _Tp, int n> inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+    for( int i = 0; i < (n/2); i++ )
+        ptr[i] = a.s[i+(n/2)];
+}
+
+template<typename _Tp, int n> inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+    for( int i = 0; i < n; i++ )
+        ptr[i] = a.s[i];
+}
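+
+// --- Editor's illustration (not part of the original patch) ---
+// The load/store family above is the intended way in and out of memory. In
+// this generic path the lane count cannot be deduced from a bare pointer, so
+// v_load needs explicit template arguments. A round trip, in a comment:
+//
+//     // double every element of a 4-float block:
+//     v_reg<float, 4> x = v_load<float, 4>(src);
+//     x *= v_reg<float, 4>::all(2.f);
+//     v_store(dst, x);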
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> c;
+    for( int i = 0; i < (n/2); i++ )
+    {
+        c.s[i] = a.s[i];
+        c.s[i+(n/2)] = b.s[i];
+    }
+    return c;
+}
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> c;
+    for( int i = 0; i < (n/2); i++ )
+    {
+        c.s[i] = a.s[i+(n/2)];
+        c.s[i+(n/2)] = b.s[i+(n/2)];
+    }
+    return c;
+}
+
+template<typename _Tp, int n> inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                                                      v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
+{
+    for( int i = 0; i < (n/2); i++ )
+    {
+        low.s[i] = a.s[i];
+        low.s[i+(n/2)] = b.s[i];
+        high.s[i] = a.s[i+(n/2)];
+        high.s[i+(n/2)] = b.s[i+(n/2)];
+    }
+}
+
+template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
+{
+    v_reg<int, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = cvRound(a.s[i]);
+    return c;
+}
+
+template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
+{
+    v_reg<int, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = cvFloor(a.s[i]);
+    return c;
+}
+
+template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
+{
+    v_reg<int, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = cvCeil(a.s[i]);
+    return c;
+}
+
+template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
+{
+    v_reg<int, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = (int)(a.s[i]);
+    return c;
+}
+
+template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> c;
+    for( int i = 0; i < n; i++ )
+    {
+        c.s[i] = cvRound(a.s[i]);
+        c.s[i+n] = 0;
+    }
+    return c;
+}
+
+template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> c;
+    for( int i = 0; i < n; i++ )
+    {
+        c.s[i] = cvFloor(a.s[i]);
+        c.s[i+n] = 0;
+    }
+    return c;
+}
+
+template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> c;
+    for( int i = 0; i < n; i++ )
+    {
+        c.s[i] = cvCeil(a.s[i]);
+        c.s[i+n] = 0;
+    }
+    return c;
+}
+
+template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> c;
+    for( int i = 0; i < n; i++ )
+    {
+        c.s[i] = (int)(a.s[i]); // truncate toward zero, not cvCeil
+        c.s[i+n] = 0;
+    }
+    return c;
+}
+
+template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
+{
+    v_reg<float, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = (float)a.s[i];
+    return c;
+}
+
+template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n>& a)
+{
+    v_reg<double, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = (double)a.s[i];
+    return c;
+}
+
+template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n>& a)
+{
+    v_reg<double, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = (double)a.s[i];
+    return c;
+}
+
+template<typename _Tp2, typename _Tp, int n> inline v_reg<_Tp2, n*2> v_cvtsat(const v_reg<_Tp, n>& a,
+                                                                              const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp2, n*2> c;
+    for( int i = 0; i < n; i++ )
+    {
+        c.s[i] = saturate_cast<_Tp2>(a.s[i]);
+        c.s[i+n] = saturate_cast<_Tp2>(b.s[i]);
+    }
+    return c;
+}
+
+template<typename _Tp2, typename _Tp, int n> inline v_reg<_Tp2, n*2> v_cvtsat(const v_reg<_Tp, n>& a,
+                                                                              const v_reg<_Tp, n>& b,
+                                                                              int rshift)
+{
+    v_reg<_Tp2, n*2> c;
+    for( int i = 0; i < n; i++ )
+    {
+        c.s[i] = saturate_cast<_Tp2>((a.s[i] + (1<<(rshift-1))) >> rshift);
+        c.s[i+n] = saturate_cast<_Tp2>((b.s[i] + (1<<(rshift-1))) >> rshift);
+    }
+    return c;
+}
+
+template<typename _Tp2, typename _Tp, int n> inline void v_storesat(_Tp2* ptr, const v_reg<_Tp, n>& a)
+{
+    for( int i = 0; i < n; i++ )
+    {
+        ptr[i] = saturate_cast<_Tp2>(a.s[i]);
+    }
+}
+
+template<typename _Tp2, typename _Tp, int n> inline void v_storesat(_Tp2* ptr, const v_reg<_Tp, n>& a, int rshift)
+{
+    for( int i = 0; i < n; i++ )
+    {
+        ptr[i] = saturate_cast<_Tp2>((a.s[i] + (1<<(rshift-1))) >> rshift);
+    }
+}
+
+template<typename _Tp> inline void v_transpose4x4(const v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
+                                                  const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
+                                                  v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
+                                                  v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3)
+{
+    b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
+    b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
+    b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
+    b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
+}
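+
+// --- Editor's illustration (not part of the original patch) ---
+// The rshift variants of v_cvtsat/v_storesat implement round-to-nearest
+// fixed-point narrowing: adding 1 << (rshift-1) before the shift rounds
+// instead of truncating. Worked through on one lane with a.s[i] = 37 and
+// rshift = 3:
+//
+//     (37 + (1 << 2)) >> 3  ==  41 >> 3  ==  5   // 37/8 = 4.625, rounds to 5
+//     37 >> 3               ==  4                // plain truncation
+//
+// The result then passes through saturate_cast<_Tp2>, so values outside the
+// narrower destination type are clamped rather than wrapped.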
+
+#if CV_SSE2
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+
+struct v_uint8x16
+{
+    explicit v_uint8x16(__m128i v) : val(v) {}
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
+                            (char)v4, (char)v5, (char)v6, (char)v7,
+                            (char)v8, (char)v9, (char)v10, (char)v11,
+                            (char)v12, (char)v13, (char)v14, (char)v15);
+    }
+    uchar get(const int i) const
+    {
+        return (uchar)(_mm_extract_epi16(val, i/2) >> ((i&1)*8));
+    }
+    uchar get0() const
+    {
+        return (uchar)_mm_cvtsi128_si32(val);
+    }
+
+    __m128i val;
+};
+
+struct v_int8x16
+{
+    explicit v_int8x16(__m128i v) : val(v) {}
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
+                            (char)v4, (char)v5, (char)v6, (char)v7,
+                            (char)v8, (char)v9, (char)v10, (char)v11,
+                            (char)v12, (char)v13, (char)v14, (char)v15);
+    }
+    schar get(const int i) const
+    {
+        return (schar)(_mm_extract_epi16(val, i/2) >> ((i&1)*8));
+    }
+    schar get0() const
+    {
+        return (schar)_mm_cvtsi128_si32(val);
+    }
+
+    __m128i val;
+};
+
+struct v_uint16x8
+{
+    explicit v_uint16x8(__m128i v) : val(v) {}
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
+                             (short)v4, (short)v5, (short)v6, (short)v7);
+    }
+    ushort get(const int i) const
+    {
+        return (ushort)_mm_extract_epi16(val, i);
+    }
+    ushort get0() const
+    {
+        return (ushort)_mm_cvtsi128_si32(val);
+    }
+
+    __m128i val;
+};
+
+struct v_int16x8
+{
+    explicit v_int16x8(__m128i v) : val(v) {}
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
+                             (short)v4, (short)v5, (short)v6, (short)v7);
+    }
+    short get(const int i) const
+    {
+        return (short)_mm_extract_epi16(val, i);
+    }
+    short get0() const
+    {
+        return (short)_mm_cvtsi128_si32(val);
+    }
+    __m128i val;
+};
+
+struct v_uint32x4
+{
+    explicit v_uint32x4(__m128i v) : val(v) {}
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {
+        val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
+    }
+    unsigned get(const int i) const
+    {
+        unsigned CV_DECL_ALIGNED(16) buf[4];
+        _mm_store_si128((__m128i*)buf, val);
+        return buf[i];
+    }
+    unsigned get0() const
+    {
+        return (unsigned)_mm_cvtsi128_si32(val);
+    }
+    __m128i val;
+};
+
+struct v_int32x4
+{
+    explicit v_int32x4(__m128i v) : val(v) {}
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        val = _mm_setr_epi32(v0, v1, v2, v3);
+    }
+    int get(int i) const
+    {
+        int CV_DECL_ALIGNED(16) buf[4];
+        _mm_store_si128((__m128i*)buf, val);
+        return buf[i];
+    }
+    int get0() const
+    {
+        return _mm_cvtsi128_si32(val);
+    }
+    __m128i val;
+};
+
+struct v_float32x4
+{
+    explicit v_float32x4(__m128 v) : val(v) {}
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        val = _mm_setr_ps(v0, v1, v2, v3);
+    }
+    float get(int i) const
+    {
+        float CV_DECL_ALIGNED(16) buf[4];
+        _mm_store_ps(buf, val);
+        return buf[i];
+    }
+    float get0() const
+    {
+        return _mm_cvtss_f32(val);
+    }
+    __m128 val;
+};
+
+struct v_float64x2
+{
+    explicit v_float64x2(__m128d v) : val(v) {}
+    v_float64x2(double v0, double v1)
+    {
+        val = _mm_setr_pd(v0, v1);
+    }
+    double get(int i) const
+    {
+        double CV_DECL_ALIGNED(16) buf[2];
+        _mm_store_pd(buf, val);
+        return buf[i];
+    }
+    double get0() const
+    {
+        return _mm_cvtsd_f64(val);
+    }
+    __m128d val;
+};
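+
+// --- Editor's illustration (not part of the original patch) ---
+// Each v_xxx struct above is a thin, typed wrapper over a single 128-bit SSE
+// register (__m128i/__m128/__m128d in .val); the distinct C++ types keep,
+// say, uchar and schar lanes from being mixed by accident. Sketch, in a
+// comment (operator+ for these types is defined further below):
+//
+//     v_float32x4 a(1.f, 2.f, 3.f, 4.f);
+//     v_float32x4 b(10.f, 20.f, 30.f, 40.f);
+//     float first = (a + b).get0();   // 11.f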
+
+inline v_uint8x16 v_setzero_u8() { return v_uint8x16(_mm_setzero_si128()); }
+inline v_int8x16 v_setzero_s8() { return v_int8x16(_mm_setzero_si128()); }
+inline v_uint16x8 v_setzero_u16() { return v_uint16x8(_mm_setzero_si128()); }
+inline v_int16x8 v_setzero_s16() { return v_int16x8(_mm_setzero_si128()); }
+inline v_uint32x4 v_setzero_u32() { return v_uint32x4(_mm_setzero_si128()); }
+inline v_int32x4 v_setzero_s32() { return v_int32x4(_mm_setzero_si128()); }
+inline v_float32x4 v_setzero_f32() { return v_float32x4(_mm_setzero_ps()); }
+inline v_float64x2 v_setzero_f64() { return v_float64x2(_mm_setzero_pd()); }
+
+inline v_uint8x16 v_setall_u8(uchar v) { return v_uint8x16(_mm_set1_epi8((char)v)); }
+inline v_int8x16 v_setall_s8(schar v) { return v_int8x16(_mm_set1_epi8((char)v)); }
+inline v_uint16x8 v_setall_u16(ushort v) { return v_uint16x8(_mm_set1_epi16((short)v)); }
+inline v_int16x8 v_setall_s16(short v) { return v_int16x8(_mm_set1_epi16((short)v)); }
+inline v_uint32x4 v_setall_u32(unsigned v) { return v_uint32x4(_mm_set1_epi32((int)v)); }
+inline v_int32x4 v_setall_s32(int v) { return v_int32x4(_mm_set1_epi32(v)); }
+inline v_float32x4 v_setall_f32(float v) { return v_float32x4(_mm_set1_ps(v)); }
+inline v_float64x2 v_setall_f64(double v) { return v_float64x2(_mm_set1_pd(v)); }
+
+template<typename _Tpvec> inline v_uint8x16 v_reinterpret_u8(const _Tpvec& a)
+{ return v_uint8x16(a.val); }
+
+inline v_uint8x16 v_reinterpret_u8(const v_float32x4& a)
+{ return v_uint8x16(_mm_castps_si128(a.val)); }
+
+inline v_uint8x16 v_reinterpret_u8(const v_float64x2& a)
+{ return v_uint8x16(_mm_castpd_si128(a.val)); }
+
+template<typename _Tpvec> inline v_int8x16 v_reinterpret_s8(const _Tpvec& a)
+{ return v_int8x16(a.val); }
+
+inline v_int8x16 v_reinterpret_s8(const v_float32x4& a)
+{ return v_int8x16(_mm_castps_si128(a.val)); }
+
+inline v_int8x16 v_reinterpret_s8(const v_float64x2& a)
+{ return v_int8x16(_mm_castpd_si128(a.val)); }
+
+template<typename _Tpvec> inline v_uint16x8 v_reinterpret_u16(const _Tpvec& a)
+{ return v_uint16x8(a.val); }
+
+inline v_uint16x8 v_reinterpret_u16(const v_float32x4& a)
+{ return v_uint16x8(_mm_castps_si128(a.val)); }
+
+inline v_uint16x8 v_reinterpret_u16(const v_float64x2& a)
+{ return v_uint16x8(_mm_castpd_si128(a.val)); }
+
+template<typename _Tpvec> inline v_int16x8 v_reinterpret_s16(const _Tpvec& a)
+{ return v_int16x8(a.val); }
+
+inline v_int16x8 v_reinterpret_s16(const v_float32x4& a)
+{ return v_int16x8(_mm_castps_si128(a.val)); }
+
+inline v_int16x8 v_reinterpret_s16(const v_float64x2& a)
+{ return v_int16x8(_mm_castpd_si128(a.val)); }
+
+template<typename _Tpvec> inline v_uint32x4 v_reinterpret_u32(const _Tpvec& a)
+{ return v_uint32x4(a.val); }
+
+inline v_uint32x4 v_reinterpret_u32(const v_float32x4& a)
+{ return v_uint32x4(_mm_castps_si128(a.val)); }
+
+inline v_uint32x4 v_reinterpret_u32(const v_float64x2& a)
+{ return v_uint32x4(_mm_castpd_si128(a.val)); }
+
+template<typename _Tpvec> inline v_int32x4 v_reinterpret_s32(const _Tpvec& a)
+{ return v_int32x4(a.val); }
+
+inline v_int32x4 v_reinterpret_s32(const v_float32x4& a)
+{ return v_int32x4(_mm_castps_si128(a.val)); }
+
+inline v_int32x4 v_reinterpret_s32(const v_float64x2& a)
+{ return v_int32x4(_mm_castpd_si128(a.val)); }
+
+template<typename _Tpvec> inline v_float32x4 v_reinterpret_f32(const _Tpvec& a)
+{ return v_float32x4(_mm_castsi128_ps(a.val)); }
+
+inline v_float32x4 v_reinterpret_f32(const v_float64x2& a)
+{ return v_float32x4(_mm_castpd_ps(a.val)); }
+
+template<typename _Tpvec> inline v_float64x2 v_reinterpret_f64(const _Tpvec& a)
+{ return v_float64x2(_mm_castsi128_pd(a.val)); }
+
+inline v_float64x2 v_reinterpret_f64(const v_float32x4& a)
+{ return v_float64x2(_mm_castps_pd(a.val)); }
+
+inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b)
+{
+    __m128i delta = _mm_set1_epi16(255);
+    return v_uint8x16(_mm_packus_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta),
+                                       _mm_adds_epu16(_mm_subs_epu16(b.val, delta), delta)));
+}
+inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b, int n)
+{
+    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
+    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
+    return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n),
+                                       _mm_srli_epi16(_mm_add_epi16(b.val, delta), n)));
+}
+
+inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b)
+{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }
+inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b, int n)
+{
+    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
+    return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n),
+                                       _mm_srai_epi16(_mm_add_epi16(b.val, delta), n)));
+}
+
+inline void v_storesat_u8(uchar* ptr, const v_uint16x8& a)
+{
+    __m128i delta = _mm_set1_epi16(255);
+    _mm_storel_epi64((__m128i*)ptr,
+                     _mm_packus_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), delta));
+}
+
+inline void v_storesat_u8(uchar* ptr, const v_uint16x8& a, int n)
+{
+    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
+    _mm_storel_epi64((__m128i*)ptr,
+                     _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n), delta));
+}
+
+inline void v_storesat_u8(uchar* ptr, const v_int16x8& a)
+{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
+
+inline void v_storesat_u8(uchar* ptr, const v_int16x8& a, int n)
+{
+    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
+    _mm_storel_epi64((__m128i*)ptr,
+                     _mm_packus_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), delta));
+}
+
+inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b)
+{
+    __m128i delta = _mm_set1_epi16(127);
+    return v_int8x16(_mm_packs_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta),
+                                     _mm_adds_epu16(_mm_subs_epu16(b.val, delta), delta)));
+}
+
+inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b, int n)
+{
+    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
+    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
+    return v_int8x16(_mm_packs_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n),
+                                     _mm_srli_epi16(_mm_add_epi16(b.val, delta), n)));
+}
+
+inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b)
+{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); }
+
+inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b, int n)
+{
+    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
+    return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n),
+                                     _mm_srai_epi16(_mm_add_epi16(b.val, delta), n)));
+}
+
+inline void v_storesat_s8(schar* ptr, const v_uint16x8& a)
+{
+    __m128i delta = _mm_set1_epi16(127);
+    _mm_storel_epi64((__m128i*)ptr,
+                     _mm_packs_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), delta));
+}
+
+inline void v_storesat_s8(schar* ptr, const v_uint16x8& a, int n)
+{
+    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
+    _mm_storel_epi64((__m128i*)ptr,
+                     _mm_packs_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n), delta));
+}
+inline void v_storesat_s8(schar* ptr, const v_int16x8& a)
+{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
+
+inline void v_storesat_s8(schar* ptr, const v_int16x8& a, int n)
+{
+    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
+    _mm_storel_epi64((__m128i*)ptr,
+                     _mm_packs_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), delta));
+}
+
+// bit-wise "mask ? a : b"
+inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
+{
+    return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
+}
+
+inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b)
+{
+    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
+    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
+    __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32);
+    __m128i r = _mm_packs_epi32(a1, b1);
+    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
+}
+inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b, int n)
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
+    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
+    __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
+    return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
+}
+inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b)
+{
+    __m128i delta32 = _mm_set1_epi32(32768);
+    __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32));
+    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
+}
+inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b, int n)
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
+    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
+    __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
+    return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
+}
+
+inline void v_storesat_u16(ushort* ptr, const v_uint32x4& a)
+{
+    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
+    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
+    __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768));
+    _mm_storel_epi64((__m128i*)ptr, r);
+}
+inline void v_storesat_u16(ushort* ptr, const v_uint32x4& a, int n)
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
+    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
+    __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768));
+    _mm_storel_epi64((__m128i*)ptr, r);
+}
+inline void v_storesat_u16(ushort* ptr, const v_int32x4& a)
+{
+    __m128i delta32 = _mm_set1_epi32(32768);
+    __m128i a1 = _mm_sub_epi32(a.val, delta32);
+    __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768));
+    _mm_storel_epi64((__m128i*)ptr, r);
+}
+inline void v_storesat_u16(ushort* ptr, const v_int32x4& a, int n)
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
+    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
+    __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768));
+    _mm_storel_epi64((__m128i*)ptr, r);
+}
+
+inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b)
+{
+    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(32767);
+    __m128i a1 = v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val);
+    __m128i b1 = v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val);
+    return v_int16x8(_mm_packs_epi32(a1, b1));
+}
+inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b, int n)
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1));
+    return v_int16x8(_mm_packs_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n),
+                                     _mm_srli_epi32(_mm_add_epi32(b.val, delta), n)));
+}
+inline v_int16x8 v_sat_s16(const v_int32x4& a, const v_int32x4& b)
+{ return v_int16x8(_mm_packs_epi32(a.val, b.val)); }
+inline v_int16x8 v_sat_s16(const v_int32x4& a, const v_int32x4& b, int n)
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1));
+    return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
+                                     _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
+}
+
+inline void v_storesat_s16(short* ptr, const v_uint32x4& a)
+{
+    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(32767);
+    __m128i a1 = v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
+}
+inline void v_storesat_s16(short* ptr, const v_uint32x4& a, int n)
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1));
+    __m128i a1 = _mm_srli_epi32(_mm_add_epi32(a.val, delta), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
+}
+inline void v_storesat_s16(short* ptr, const v_int32x4& a)
+{
+    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
+}
+inline void v_storesat_s16(short* ptr, const v_int32x4& a, int n)
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1));
+    __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
+}
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{
+    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
+    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
+    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
+    __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);
+
+    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
+}
+
+#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return _Tpvec(intrin(a.val, b.val)); \
+    } \
+    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+    { \
+        a.val = intrin(a.val, b.val); \
+        return a; \
+    }
+
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
+
+inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
+{
+    __m128i c0 = _mm_mul_epu32(a.val, b.val);
+    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
+    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
+    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
+    return v_uint32x4(_mm_unpacklo_epi64(d0, d1));
+}
+inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
+{
+    __m128i c0 = _mm_mul_epu32(a.val, b.val);
+    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
+    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
+    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
+    return v_int32x4(_mm_unpacklo_epi64(d0, d1));
+}
+
+#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
+    OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
+    OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
+    OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
+    inline _Tpvec operator ~ (const _Tpvec& a) \
+    { \
+        return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
+    }
+
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
+
+inline v_float32x4 v_sqrt(v_float32x4 x)
+{ return v_float32x4(_mm_sqrt_ps(x.val)); }
+inline v_float64x2 v_sqrt(v_float64x2 x)
+{ return v_float64x2(_mm_sqrt_pd(x.val)); }
+
+inline v_float32x4 v_abs(v_float32x4 x)
+{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
+inline v_float64x2 v_abs(v_float64x2 x)
+{
+    return v_float64x2(_mm_and_pd(x.val,
+                                  _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
+}
+
+// TODO: exp, log, sin, cos
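+
+// --- Editor's illustration (not part of the original patch) ---
+// v_matmul above computes v.s0*m0 + v.s1*m1 + v.s2*m2 + v.s3*m3, i.e. the
+// product of a 4x4 matrix, stored as its four columns m0..m3, with the
+// vector v. Transforming one homogeneous point (hypothetical names, in a
+// comment):
+//
+//     // cols[j] holds column j of the transform
+//     v_float32x4 transformed = v_matmul(p, cols[0], cols[1], cols[2], cols[3]);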
+
+#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
+
+inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
+{
+    __m128i delta = _mm_set1_epi8((char)0x80);
+    return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
+                                                       _mm_xor_si128(b.val, delta))));
+}
+inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
+{
+    __m128i delta = _mm_set1_epi8((char)0x80);
+    return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
+                                                       _mm_xor_si128(b.val, delta))));
+}
+inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
+{
+    return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
+}
+inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
+{
+    return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
+}
+inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
+{
+    __m128i delta = _mm_set1_epi32((int)0x80000000);
+    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
+    return v_uint32x4(v_select_si128(mask, b.val, a.val));
+}
+inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
+{
+    __m128i delta = _mm_set1_epi32((int)0x80000000);
+    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
+    return v_uint32x4(v_select_si128(mask, a.val, b.val));
+}
+inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
+{
+    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
+}
+inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
+{
+    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
+}
+
+#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
+inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
+{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
+inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
+} \
+inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
+{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
+inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
+} \
+inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); \
+    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
+} \
+inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); \
+    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
+} \
+inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
+    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
+} \
+inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
+    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
+} \
+inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
+} \
+inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
+} \
+inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
+} \
+inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
+}
+
+OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)0x80)
+OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)0x8000)
+OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
+
+#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
+OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
+
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
+
+#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, sbit) \
+inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \
+} \
+inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    __m128i smask = _mm_set1_epi##bits(sbit); \
+    __m128i a1 = _mm_xor_si128(a.val, smask); \
+    __m128i b1 = _mm_xor_si128(b.val, smask); \
+    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \
+}
+
+OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (char)0x80)
+OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (short)0x8000)
+
+#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
+inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
+    return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
+} \
+inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
+    return _Tpvec(_mm_sqrt_##suffix(res)); \
+} \
+inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
+    return _Tpvec(res); \
+} \
+inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+{ \
+    return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \
+}
+
+OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
+OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
+
+#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \
+inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
+} \
+inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(_mm_srai_##suffix(a.val, imm)); \
+}
+
+OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16)
+OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32)
+
+inline v_int16x8 v_mullo(const v_int16x8& a, const v_int16x8& b)
+{
+    return v_int16x8(_mm_mullo_epi16(a.val, b.val));
+}
+inline v_uint16x8 v_mullo(const v_uint16x8& a, const v_uint16x8& b)
+{
+    return v_uint16x8(_mm_mullo_epi16(a.val, b.val));
+}
+inline v_int16x8 v_mulhi2(const v_int16x8& a, const v_int16x8& b)
+{
+    return v_int16x8(_mm_slli_epi16(_mm_mulhi_epi16(a.val, b.val), 1));
+}
+inline v_uint16x8 v_mulhi2(const v_uint16x8& a, const v_uint16x8& b)
+{
+    return v_uint16x8(_mm_slli_epi16(_mm_mulhi_epu16(a.val, b.val), 1));
+}
+
+#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+    return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
+                                     _mm_loadl_epi64((const __m128i*)ptr1))); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ _mm_store_si128((__m128i*)ptr, a.val); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }
+
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
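+
+// --- Editor's illustration (not part of the original patch) ---
+// v_load/v_store above use unaligned moves (_mm_loadu/_mm_storeu) and accept
+// any address; the *_aligned variants use _mm_load/_mm_store and require a
+// 16-byte aligned pointer, which is typically cheaper on older hardware.
+// Sketch, in a comment:
+//
+//     v_uint8x16 x = v_load(src);          // src may be arbitrarily aligned
+//     v_store_aligned(dst16, x);           // dst16 must be 16-byte aligned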
+
+#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(_mm_load_##suffix(ptr)); } \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+    return _Tpvec(_mm_castsi128_##suffix( \
+        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
+                           _mm_loadl_epi64((const __m128i*)ptr1)))); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ _mm_storeu_##suffix(ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ _mm_store_##suffix(ptr, a.val); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+    __m128i a1 = _mm_cast##suffix##_si128(a.val); \
+    _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
+}
+
+OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
+
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    scalartype CV_DECL_ALIGNED(16) buf[4]; \
+    v_store_aligned(buf, a); \
+    scalartype s0 = scalar_func(buf[0], buf[1]); \
+    scalartype s1 = scalar_func(buf[2], buf[3]); \
+    return scalar_func(s0, s1); \
+}
+
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
+
+#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
+inline int v_signmask(const _Tpvec& a) \
+{ \
+    return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \
+} \
+inline bool v_check_all(const _Tpvec& a) \
+{ return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \
+inline bool v_check_any(const _Tpvec& a) \
+{ return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; }
+
+#define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a)
+inline __m128i v_packq_epi32(__m128i a)
+{
+    __m128i b = _mm_packs_epi32(a, a);
+    return _mm_packs_epi16(b, b);
+}
+
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
+
+#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
+}
+
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
+
+#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
+inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
+{ \
+    __m128i z = _mm_setzero_si128(); \
+    b0.val = _mm_unpacklo_##suffix(a.val, z); \
+    b1.val = _mm_unpackhi_##suffix(a.val, z); \
+} \
+inline _Tpwuvec v_load_expand(const _Tpu* ptr) \
+{ \
+    __m128i z = _mm_setzero_si128(); \
+    return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
+} \
+inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \
+{ \
+    b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \
+    b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
+} \
+inline _Tpwsvec v_load_expand(const _Tps* ptr) \
+{ \
+    __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
+    return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
+}
+
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{
+    __m128i z = _mm_setzero_si128();
+    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
+    return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z));
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{
+    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
+    a = _mm_unpacklo_epi8(a, a);
+    a = _mm_unpacklo_epi8(a, a);
+    return v_int32x4(_mm_srai_epi32(a, 24));
+}
+
+#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
+{ \
+    b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
+    b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
+} \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
+    return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
+    return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
+{ \
+    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
+    c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
+    d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
+}
+
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
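+
+// --- Editor's illustration (not part of the original patch) ---
+// v_reduce_sum/min/max above fold the four lanes of a 32-bit vector into one
+// scalar by spilling to an aligned buffer and combining pairwise. Summing a
+// float array four lanes at a time (hypothetical snippet, in a comment;
+// assumes n is a multiple of 4):
+//
+//     v_float32x4 s = v_setzero_f32();
+//     for( int i = 0; i < n; i += 4 )
+//         s += v_load(ptr + i);
+//     float total = v_reduce_sum(s);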
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd) + +inline v_int32x4 v_round(const v_float32x4& a) +{ return v_int32x4(_mm_cvtps_epi32(a.val)); } + +inline v_int32x4 v_floor(const v_float32x4& a) +{ + __m128i a1 = _mm_cvtps_epi32(a.val); + __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val)); + return v_int32x4(_mm_add_epi32(a1, mask)); +} + +inline v_int32x4 v_ceil(const v_float32x4& a) +{ + __m128i a1 = _mm_cvtps_epi32(a.val); + __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1))); + return v_int32x4(_mm_sub_epi32(a1, mask)); +} + +inline v_int32x4 v_trunc(const v_float32x4& a) +{ return v_int32x4(_mm_cvttps_epi32(a.val)); } + +inline v_int32x4 v_round(const v_float64x2& a) +{ return v_int32x4(_mm_cvtpd_epi32(a.val)); } + +inline v_int32x4 v_floor(const v_float64x2& a) +{ + __m128i a1 = _mm_cvtpd_epi32(a.val); + __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val)); + mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0 + return v_int32x4(_mm_add_epi32(a1, mask)); +} + +inline v_int32x4 v_ceil(const v_float64x2& a) +{ + __m128i a1 = _mm_cvtpd_epi32(a.val); + __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1))); + mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0 + return v_int32x4(_mm_sub_epi32(a1, mask)); +} + +inline v_int32x4 v_trunc(const v_float64x2& a) +{ return v_int32x4(_mm_cvttpd_epi32(a.val)); } + +#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \ +inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, \ + _Tpvec& b2, _Tpvec& b3) \ +{ \ + __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \ + __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \ + __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \ + __m128i t3 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \ +\ + b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \ + b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \ + b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \ + b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \ +} + +OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) + +#if 0 +inline void v_load_deinterleave(const uchar*, v_uint8x16&, v_uint8x16&, v_uint8x16&) +{ + // !!! TODO !!! +} +#endif + +inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d) +{ + __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ... + __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ... + __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ... + __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ... + + __m128 v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ... + __m128 v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ... + __m128 v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ... + __m128 v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b4 b14 ... + + u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ... + u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ... + u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ... + u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ... 
+
+    v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
+    v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
+    v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
+    v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
+
+    a.val = _mm_unpacklo_epi8(v0, v1);
+    b.val = _mm_unpackhi_epi8(v0, v1);
+    c.val = _mm_unpacklo_epi8(v2, v3);
+    d.val = _mm_unpackhi_epi8(v2, v3);
+}
+
+#if 0
+inline void v_load_deinterleave(const ushort*, v_uint16x8&, v_uint16x8&, v_uint16x8&)
+{
+    // !!! TODO !!!
+}
+#endif
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
+{
+    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
+    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
+    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
+    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...
+
+    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
+    __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
+    __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
+    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...
+
+    u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
+    u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
+    u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
+    u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...
+
+    a.val = _mm_unpacklo_epi16(u0, u1);
+    b.val = _mm_unpackhi_epi16(u0, u1);
+    c.val = _mm_unpacklo_epi16(u2, u3);
+    d.val = _mm_unpackhi_epi16(u2, u3);
+}
+
+#if 0
+inline void v_load_deinterleave(const unsigned*, v_uint32x4&, v_uint32x4&, v_uint32x4&)
+{
+    // !!! TODO !!!
+}
+#endif
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
+{
+    v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr));        // a0 b0 c0 d0
+    v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4)));  // a1 b1 c1 d1
+    v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8)));  // a2 b2 c2 d2
+    v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
+
+    v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
+}
+
+#if 0
+inline void v_load_deinterleave(const float*, v_float32x4&, v_float32x4&, v_float32x4&)
+{
+    // !!! TODO !!!
+}
+#endif
+
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
+{
+    v_float32x4 u0(_mm_loadu_ps(ptr));
+    v_float32x4 u1(_mm_loadu_ps(ptr + 4));
+    v_float32x4 u2(_mm_loadu_ps(ptr + 8));
+    v_float32x4 u3(_mm_loadu_ps(ptr + 12));
+
+    v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                const v_uint8x16& c, const v_uint8x16& d)
+{
+    // a0 a1 a2 a3 ....
+    // b0 b1 b2 b3 ....
+    // c0 c1 c2 c3 ....
+    // d0 d1 d2 d3 ....
+    __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
+    __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
+    __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
+    __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...
+
+    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
+    __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
+    __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
+    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...
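+
+    // v0 holds interleaved pixels 0-3, v2 pixels 4-7, v1 pixels 8-11 and
+    // v3 pixels 12-15, which is why the stores below write them back in the
+    // order v0, v2, v1, v3.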
+
+    _mm_storeu_si128((__m128i*)ptr, v0);
+    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
+    _mm_storeu_si128((__m128i*)(ptr + 32), v1);
+    _mm_storeu_si128((__m128i*)(ptr + 48), v3);
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                                const v_uint16x8& c, const v_uint16x8& d)
+{
+    // a0 a1 a2 a3 ....
+    // b0 b1 b2 b3 ....
+    // c0 c1 c2 c3 ....
+    // d0 d1 d2 d3 ....
+    __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
+    __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
+    __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
+    __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...
+
+    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
+    __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
+    __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
+    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...
+
+    _mm_storeu_si128((__m128i*)ptr, v0);
+    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
+    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
+    _mm_storeu_si128((__m128i*)(ptr + 24), v3);
+}
+
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{
+    return v_float32x4(_mm_cvtepi32_ps(a.val));
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+    return v_float32x4(_mm_cvtpd_ps(a.val));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{
+    return v_float64x2(_mm_cvtepi32_pd(a.val));
+}
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{
+    return v_float64x2(_mm_cvtps_pd(a.val));
+}
+
+#elif CV_NEON
+
+#define CV_SIMD128 1
+
+struct v_uint8x16
+{
+    uint8x16_t val;
+};
+
+struct v_int8x16
+{
+    int8x16_t val;
+};
+
+struct v_uint16x8
+{
+    uint16x8_t val;
+};
+
+struct v_int16x8
+{
+    int16x8_t val;
+};
+
+struct v_uint32x4
+{
+    uint32x4_t val;
+};
+
+struct v_int32x4
+{
+    int32x4_t val;
+};
+
+struct v_float32x4
+{
+    float32x4_t val;
+};
+
+typedef v_reg<double, 2> v_float64x2;
+typedef v_reg<double, 4> v_float64x4;
+
+#else
+
+typedef v_reg<uchar, 16> v_uint8x16;
+typedef v_reg<schar, 16> v_int8x16;
+typedef v_reg<ushort, 8> v_uint16x8;
+typedef v_reg<short, 8> v_int16x8;
+typedef v_reg<unsigned, 4> v_uint32x4;
+typedef v_reg<int, 4> v_int32x4;
+typedef v_reg<float, 4> v_float32x4;
+typedef v_reg<float, 8> v_float32x8;
+typedef v_reg<double, 2> v_float64x2;
+typedef v_reg<double, 4> v_float64x4;
+
+inline v_uint8x16 v_setzero_u8() { return v_uint8x16::zero(); }
+inline v_int8x16 v_setzero_s8() { return v_int8x16::zero(); }
+inline v_uint16x8 v_setzero_u16() { return v_uint16x8::zero(); }
+inline v_int16x8 v_setzero_s16() { return v_int16x8::zero(); }
+inline v_uint32x4 v_setzero_u32() { return v_uint32x4::zero(); }
+inline v_int32x4 v_setzero_s32() { return v_int32x4::zero(); }
+inline v_float32x4 v_setzero_f32() { return v_float32x4::zero(); }
+inline v_float64x2 v_setzero_f64() { return v_float64x2::zero(); }
+
+inline v_uint8x16 v_setall_u8(uchar v) { return v_uint8x16::all(v); }
+inline v_int8x16 v_setall_s8(schar v) { return v_int8x16::all(v); }
+inline v_uint16x8 v_setall_u16(ushort v) { return v_uint16x8::all(v); }
+inline v_int16x8 v_setall_s16(short v) { return v_int16x8::all(v); }
+inline v_uint32x4 v_setall_u32(unsigned v) { return v_uint32x4::all(v); }
+inline v_int32x4 v_setall_s32(int v) { return v_int32x4::all(v); }
+inline v_float32x4 v_setall_f32(float v) { return v_float32x4::all(v); }
+inline v_float64x2 v_setall_f64(double v) { return v_float64x2::all(v); }
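+
+// A minimal usage sketch for these fallback types (assuming the v_reg
+// arithmetic and store operations declared earlier in this header; the
+// lane values are arbitrary):
+//
+//     v_float32x4 a = v_setall_f32(2.f), b = v_setall_f32(3.f);
+//     float lanes[4];
+//     v_store(lanes, a + b); // every element of lanes becomes 5.f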
+
+template<typename _Tp, int n> inline v_uint8x16 v_reinterpret_u8(const v_reg<_Tp, n>& a)
+{ return v_reg<_Tp, n>::template reinterpret_as<uchar, 16>(a); }
+
+template<typename _Tp, int n> inline v_int8x16 v_reinterpret_s8(const v_reg<_Tp, n>& a)
+{ return v_reg<_Tp, n>::template reinterpret_as<schar, 16>(a); }
+
+template<typename _Tp, int n> inline v_uint16x8 v_reinterpret_u16(const v_reg<_Tp, n>& a)
+{ return v_reg<_Tp, n>::template reinterpret_as<ushort, 8>(a); }
+
+template<typename _Tp, int n> inline v_int16x8 v_reinterpret_s16(const v_reg<_Tp, n>& a)
+{ return v_reg<_Tp, n>::template reinterpret_as<short, 8>(a); }
+
+template<typename _Tp, int n> inline v_uint32x4 v_reinterpret_u32(const v_reg<_Tp, n>& a)
+{ return v_reg<_Tp, n>::template reinterpret_as<unsigned, 4>(a); }
+
+template<typename _Tp, int n> inline v_int32x4 v_reinterpret_s32(const v_reg<_Tp, n>& a)
+{ return v_reg<_Tp, n>::template reinterpret_as<int, 4>(a); }
+
+template<typename _Tp, int n> inline v_float32x4 v_reinterpret_f32(const v_reg<_Tp, n>& a)
+{ return v_reg<_Tp, n>::template reinterpret_as<float, 4>(a); }
+
+template<typename _Tp, int n> inline v_float64x2 v_reinterpret_f64(const v_reg<_Tp, n>& a)
+{ return v_reg<_Tp, n>::template reinterpret_as<double, 2>(a); }
+
+inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_cvtsat(a, b); }
+inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b, int n)
+{ return v_cvtsat(a, b, n); }
+inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b)
+{ return v_cvtsat(a, b); }
+inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b, int n)
+{ return v_cvtsat(a, b, n); }
+
+inline void v_storesat_u8(uchar* ptr, const v_uint16x8& b)
+{ return v_storesat(ptr, b); }
+inline void v_storesat_u8(uchar* ptr, const v_uint16x8& b, int n)
+{ return v_storesat(ptr, b, n); }
+inline void v_storesat_u8(uchar* ptr, const v_int16x8& b)
+{ return v_storesat(ptr, b); }
+inline void v_storesat_u8(uchar* ptr, const v_int16x8& b, int n)
+{ return v_storesat(ptr, b, n); }
+
+inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_cvtsat(a, b); }
+inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b, int n)
+{ return v_cvtsat(a, b, n); }
+inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b)
+{ return v_cvtsat(a, b); }
+inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b, int n)
+{ return v_cvtsat(a, b, n); }
+
+inline void v_storesat_s8(schar* ptr, const v_uint16x8& b)
+{ return v_storesat(ptr, b); }
+inline void v_storesat_s8(schar* ptr, const v_uint16x8& b, int n)
+{ return v_storesat(ptr, b, n); }
+inline void v_storesat_s8(schar* ptr, const v_int16x8& b)
+{ return v_storesat(ptr, b); }
+inline void v_storesat_s8(schar* ptr, const v_int16x8& b, int n)
+{ return v_storesat(ptr, b, n); }
+
+inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b)
+{ return v_cvtsat(a, b); }
+inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b, int n)
+{ return v_cvtsat(a, b, n); }
+inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvtsat(a, b); }
+inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b, int n)
+{ return v_cvtsat(a, b, n); }
+
+inline void v_storesat_u16(ushort* ptr, const v_uint32x4& b)
+{ return v_storesat(ptr, b); }
+inline void v_storesat_u16(ushort* ptr, const v_uint32x4& b, int n)
+{ return v_storesat(ptr, b, n); }
+inline void v_storesat_u16(ushort* ptr, const v_int32x4& b)
+{ return v_storesat(ptr, b); }
+inline void v_storesat_u16(ushort* ptr, const v_int32x4& b, int n)
+{ return v_storesat(ptr, b, n); }
+
+inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b)
+{ return v_cvtsat(a, b); }
+inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b, int n)
+{ return v_cvtsat(a, b, n); }
+inline v_int16x8 v_sat_s16(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvtsat(a, b); }
+inline v_int16x8 v_sat_s16(const v_int32x4& a,
const v_int32x4& b, int n) +{ return v_cvtsat(a, b, n); } + +inline void v_storesat_s16(short* ptr, const v_uint32x4& b) +{ return v_storesat(ptr, b); } +inline void v_storesat_s16(short* ptr, const v_uint32x4& b, int n) +{ return v_storesat(ptr, b, n); } +inline void v_storesat_s16(short* ptr, const v_int32x4& b) +{ return v_storesat(ptr, b); } +inline void v_storesat_s16(short* ptr, const v_int32x4& b, int n) +{ return v_storesat(ptr, b, n); } + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& m3) +{ + return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0], + v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1], + v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2], + v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]); +} + +#endif + +}} + +#endif diff --git a/modules/hal/src/arithm.cpp b/modules/hal/src/arithm.cpp new file mode 100644 index 000000000..a3f69facc --- /dev/null +++ b/modules/hal/src/arithm.cpp @@ -0,0 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +}} diff --git a/modules/hal/src/color.cpp b/modules/hal/src/color.cpp new file mode 100644 index 000000000..a3f69facc --- /dev/null +++ b/modules/hal/src/color.cpp @@ -0,0 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +}} diff --git a/modules/hal/src/filter.cpp b/modules/hal/src/filter.cpp new file mode 100644 index 000000000..a3f69facc --- /dev/null +++ b/modules/hal/src/filter.cpp @@ -0,0 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +}} diff --git a/modules/hal/src/mathfuncs.cpp b/modules/hal/src/mathfuncs.cpp new file mode 100644 index 000000000..a3f69facc --- /dev/null +++ b/modules/hal/src/mathfuncs.cpp @@ -0,0 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +}} diff --git a/modules/hal/src/matrix.cpp b/modules/hal/src/matrix.cpp new file mode 100644 index 000000000..a3f69facc --- /dev/null +++ b/modules/hal/src/matrix.cpp @@ -0,0 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +}} diff --git a/modules/hal/src/precomp.hpp b/modules/hal/src/precomp.hpp index 04ec37821..e6923fb89 100644 --- a/modules/hal/src/precomp.hpp +++ b/modules/hal/src/precomp.hpp @@ -1,2 +1,44 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ #include "opencv2/hal.hpp" +#include "opencv2/hal/intrin.hpp" diff --git a/modules/hal/src/resize.cpp b/modules/hal/src/resize.cpp new file mode 100644 index 000000000..a3f69facc --- /dev/null +++ b/modules/hal/src/resize.cpp @@ -0,0 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +}} diff --git a/modules/hal/src/norm.cpp b/modules/hal/src/stat.cpp similarity index 100% rename from modules/hal/src/norm.cpp rename to modules/hal/src/stat.cpp diff --git a/modules/hal/src/warp.cpp b/modules/hal/src/warp.cpp new file mode 100644 index 000000000..a3f69facc --- /dev/null +++ b/modules/hal/src/warp.cpp @@ -0,0 +1,47 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+//
+//M*/
+
+#include "precomp.hpp"
+
+namespace cv { namespace hal {
+
+}}

From cb00c7036f10b2b56d608113a3d1a433fab82bed Mon Sep 17 00:00:00 2001
From: Maksim Shabunin
Date: Wed, 15 Apr 2015 15:46:07 +0300
Subject: [PATCH 13/48] Fix cache error during SVM train

---
 modules/ml/src/svm.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index 05994eb91..0012a3388 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -538,6 +538,8 @@ public:
             {
                 kr.idx = cache_size;
                 cache_size++;
+                if (!lru_last)
+                    lru_last = i1+1;
             }
             else
             {
@@ -546,6 +548,8 @@
                 last.idx = -1;
                 lru_cache[last.prev].next = 0;
                 lru_last = last.prev;
+                last.prev = 0;
+                last.next = 0;
             }
             kernel->calc( sample_count, var_count, samples.ptr<float>(),
                           samples.ptr<float>(i1), lru_cache_data.ptr<float>(kr.idx) );
@@ -561,6 +565,8 @@
                 else
                     lru_first = kr.next;
             }
+            if (lru_first)
+                lru_cache[lru_first].prev = i1+1;
             kr.next = lru_first;
             kr.prev = 0;
             lru_first = i1+1;

From a0bc0ab7cb7ee4eeb4e9f79cfbaace267fcf203e Mon Sep 17 00:00:00 2001
From: Nisarg Thakkar
Date: Wed, 15 Apr 2015 23:49:01 +0530
Subject: [PATCH 14/48] Fix for Bug4243

---
 modules/imgproc/src/morph.cpp | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.cpp
index 44eb3adfc..78c0e4f0a 100644
--- a/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@@ -1820,11 +1820,22 @@ static bool ocl_morphologyEx(InputArray _src, OutputArray _dst, int op,
 #endif
 
 void cv::morphologyEx( InputArray _src, OutputArray _dst, int op,
-                       InputArray kernel, Point anchor, int iterations,
+                       InputArray _kernel, Point anchor, int iterations,
                        int borderType, const Scalar& borderValue )
 {
 #ifdef HAVE_OPENCL
-    Size ksize = kernel.size();
+    Size ksize = _kernel.size();
+    Mat tempKernel;
+    if (ksize.height==0 || ksize.width==0)
+    {
+        tempKernel = getStructuringElement(MORPH_RECT, Size(3,3), Point(1,1));
+        ksize = tempKernel.size();
+    }
+    else
+    {
+        tempKernel = _kernel.getMat();
+    }
+    InputArray kernel = InputArray(tempKernel);
     anchor = normalizeAnchor(anchor, ksize);
 
     CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && _src.channels() <= 4 &&

From 00094a879280f89acaad8f1d48bd8a873957e32b Mon Sep 17 00:00:00 2001
From: Nisarg Thakkar
Date: Thu, 16 Apr 2015 00:22:08 +0530
Subject: [PATCH 15/48] Fix for Bug4243.
Made changes relating to non OpenCL devices

---
 modules/imgproc/src/morph.cpp | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.cpp
index 78c0e4f0a..f2d971bea 100644
--- a/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@@ -1823,19 +1823,13 @@ void cv::morphologyEx( InputArray _src, OutputArray _dst, int op,
                        InputArray _kernel, Point anchor, int iterations,
                        int borderType, const Scalar& borderValue )
 {
+    Mat kernel = _kernel.getMat();
+    if (kernel.empty())
+    {
+        kernel = getStructuringElement(MORPH_RECT, Size(3,3), Point(1,1));
+    }
 #ifdef HAVE_OPENCL
-    Size ksize = _kernel.size();
-    Mat tempKernel;
-    if (ksize.height==0 || ksize.width==0)
-    {
-        tempKernel = getStructuringElement(MORPH_RECT, Size(3,3), Point(1,1));
-        ksize = tempKernel.size();
-    }
-    else
-    {
-        tempKernel = _kernel.getMat();
-    }
-    InputArray kernel = InputArray(tempKernel);
+    Size ksize = kernel.size();
     anchor = normalizeAnchor(anchor, ksize);
 
     CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && _src.channels() <= 4 &&

From 2358c79db2465ffdaf333939eba4489b1f4790d6 Mon Sep 17 00:00:00 2001
From: Ahmatnurov Dinar
Date: Thu, 16 Apr 2015 14:36:46 +0300
Subject: [PATCH 16/48] move 3265 to 3.0;

---
 modules/imgcodecs/test/test_drawing.cpp | 78 +++++++++++++++++++++++++
 modules/imgproc/src/drawing.cpp         | 56 ++++++++++++++++--
 2 files changed, 129 insertions(+), 5 deletions(-)

diff --git a/modules/imgcodecs/test/test_drawing.cpp b/modules/imgcodecs/test/test_drawing.cpp
index f4e157fb2..d6d76822b 100644
--- a/modules/imgcodecs/test/test_drawing.cpp
+++ b/modules/imgcodecs/test/test_drawing.cpp
@@ -448,3 +448,81 @@ protected:
 };
 
 TEST(Imgcodecs_Drawing, fillconvexpoly_clipping) { CV_FillConvexPolyTest test; test.safe_run(); }
+
+class CV_DrawingTest_UTF8 : public cvtest::BaseTest
+{
+public:
+    CV_DrawingTest_UTF8() {}
+    ~CV_DrawingTest_UTF8() {}
+protected:
+    void run(int)
+    {
+        vector<string> lines;
+        lines.push_back("abcdefghijklmnopqrstuvwxyz1234567890");
+        // cyrillic letters small
+        lines.push_back("\xD0\xB0\xD0\xB1\xD0\xB2\xD0\xB3\xD0\xB4\xD0\xB5\xD1\x91\xD0\xB6\xD0\xB7"
+                        "\xD0\xB8\xD0\xB9\xD0\xBA\xD0\xBB\xD0\xBC\xD0\xBD\xD0\xBE\xD0\xBF\xD1\x80"
+                        "\xD1\x81\xD1\x82\xD1\x83\xD1\x84\xD1\x85\xD1\x86\xD1\x87\xD1\x88\xD1\x89"
+                        "\xD1\x8A\xD1\x8B\xD1\x8C\xD1\x8D\xD1\x8E\xD1\x8F");
+        // cyrillic letters capital
+        lines.push_back("\xD0\x90\xD0\x91\xD0\x92\xD0\x93\xD0\x94\xD0\x95\xD0\x81\xD0\x96\xD0\x97"
+                        "\xD0\x98\xD0\x99\xD0\x9A\xD0\x9B\xD0\x9C\xD0\x9D\xD0\x9E\xD0\x9F\xD0\xA0"
+                        "\xD0\xA1\xD0\xA2\xD0\xA3\xD0\xA4\xD0\xA5\xD0\xA6\xD0\xA7\xD0\xA8\xD0\xA9"
+                        "\xD0\xAA\xD0\xAB\xD0\xAC\xD0\xAD\xD0\xAE\xD0\xAF");
+        // bounds
+        lines.push_back("-\xD0\x80-\xD0\x8E-\xD0\x8F-");
+        lines.push_back("-\xD1\x90-\xD1\x91-\xD1\xBF-");
+        // bad utf8
+        lines.push_back("-\x81-\x82-\x83-");
+        lines.push_back("--\xF0--");
+        lines.push_back("-\xF0");
+
+        vector<int> fonts;
+        fonts.push_back(FONT_HERSHEY_SIMPLEX);
+        fonts.push_back(FONT_HERSHEY_PLAIN);
+        fonts.push_back(FONT_HERSHEY_DUPLEX);
+        fonts.push_back(FONT_HERSHEY_COMPLEX);
+        fonts.push_back(FONT_HERSHEY_TRIPLEX);
+        fonts.push_back(FONT_HERSHEY_COMPLEX_SMALL);
+        fonts.push_back(FONT_HERSHEY_SCRIPT_SIMPLEX);
+        fonts.push_back(FONT_HERSHEY_SCRIPT_COMPLEX);
+
+        vector<Mat> results;
+        Size bigSize(0, 0);
+        for (vector<int>::const_iterator font = fonts.begin(); font != fonts.end(); ++font)
+        {
+            for (int italic = 0; italic <= FONT_ITALIC; italic += FONT_ITALIC)
+            {
+                for (vector<string>::const_iterator line = lines.begin(); line != lines.end(); ++line)
+                {
+                    const float fontScale = 1;
+                    const int thickness = 1;
+                    const Scalar color(20,20,20);
+                    int baseline = 0;
+
+                    Size textSize = getTextSize(*line, *font | italic, fontScale, thickness, &baseline);
+                    Point textOrg(0, textSize.height + 2);
+                    Mat img(textSize + Size(0, baseline), CV_8UC3, Scalar(255, 255, 255));
+                    putText(img, *line, textOrg, *font | italic, fontScale, color, thickness, CV_AA);
+
+                    results.push_back(img);
+                    bigSize.width = max(bigSize.width, img.size().width);
+                    bigSize.height += img.size().height + 1;
+                }
+            }
+        }
+
+        int shift = 0;
+        Mat result(bigSize, CV_8UC3, Scalar(100, 100, 100));
+        for (vector<Mat>::const_iterator img = results.begin(); img != results.end(); ++img)
+        {
+            Rect roi(Point(0, shift), img->size());
+            Mat sub(result, roi);
+            img->copyTo(sub);
+            shift += img->size().height + 1;
+        }
+        imwrite("/tmp/all_fonts.png", result);
+    }
+};
+
+TEST(Highgui_Drawing, utf8_support) { CV_DrawingTest_UTF8 test; test.safe_run(); }
diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp
index 94aef348f..27411b247 100644
--- a/modules/imgproc/src/drawing.cpp
+++ b/modules/imgproc/src/drawing.cpp
@@ -1941,7 +1941,11 @@ static const int HersheyComplex[] = {
 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025, 2026,
 2223, 2084, 2224, 2247, 587, 2249, 2101, 2102, 2103, 2104, 2105, 2106, 2107,
 2108, 2109, 2110, 2111, 2112, 2113, 2114, 2115, 2116, 2117, 2118, 2119, 2120,
 2121, 2122, 2123, 2124, 2125, 2126,
-2225, 2229, 2226, 2246 };
+2225, 2229, 2226, 2246, 2801, 2802, 2803, 2804, 2805, 2806, 2807, 2808, 2809, 2810, 2811,
+2812, 2813, 2814, 2815, 2816, 2817, 2818, 2819, 2820, 2821, 2822, 2823, 2824, 2825, 2826,
+2827, 2828, 2829, 2830, 2831, 2832, 2901, 2902, 2903, 2904, 2905, 2906, 2907, 2908, 2909,
+2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918, 2919, 2920, 2921, 2922, 2923, 2924,
+2925, 2926, 2927, 2928, 2929, 2930, 2931, 2932};
 
 static const int HersheyComplexItalic[] = {
 (9 + 12*16) + FONT_ITALIC_ALPHA + FONT_ITALIC_DIGIT + FONT_ITALIC_PUNCT +
@@ -2033,6 +2037,50 @@ static const int* getFontData(int fontFace)
     return ascii;
 }
 
+inline void readCheck(int &c, int &i, const String &text, int fontFace)
+{
+    int leftBoundary = ' ', rightBoundary = 127;
+
+    if(c >= 0x80 && fontFace == FONT_HERSHEY_COMPLEX)
+    {
+        if(c == 0xD0 && (uchar)text[i + 1] >= 0x90 && (uchar)text[i + 1] <= 0xBF)
+        {
+            c = (uchar)text[++i] - 17;
+            leftBoundary = 127;
+            rightBoundary = 175;
+        }
+        else if(c == 0xD1 && (uchar)text[i + 1] >= 0x80 && (uchar)text[i + 1] <= 0x8F)
+        {
+            c = (uchar)text[++i] + 47;
+            leftBoundary = 175;
+            rightBoundary = 191;
+        }
+        else
+        {
+            if(c >= 0xC0 && text[i+1] != 0) //2 bytes utf
+                i++;
+
+            if(c >= 0xE0 && text[i+1] != 0) //3 bytes utf
+                i++;
+
+            if(c >= 0xF0 && text[i+1] != 0) //4 bytes utf
+                i++;
+
+            if(c >= 0xF8 && text[i+1] != 0) //5 bytes utf
+                i++;
+
+            if(c >= 0xFC && text[i+1] != 0) //6 bytes utf
+                i++;
+
+            c = '?';
+        }
+    }
+
+    if(c >= rightBoundary || c < leftBoundary)
+        c = '?';
+}
+
 extern const char* g_HersheyGlyphs[];
 
 void putText( InputOutputArray _img, const String& text, Point org,
@@ -2066,8 +2114,7 @@ void putText( InputOutputArray _img, const String& text, Point org,
         int c = (uchar)text[i];
         Point p;
 
-        if( c >= 127 || c < ' ' )
-            c = '?';
+        readCheck(c, i, text, fontFace);
 
         const char* ptr = faces[ascii[(c-' ')+1]];
         p.x = (uchar)ptr[0] - 'R';
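For FONT_HERSHEY_COMPLEX, readCheck() maps the two-byte UTF-8 sequences 0xD0 0x90 .. 0xD1 0x8F onto the 64 Cyrillic glyphs (2801..2832, 2901..2932) appended to the HersheyComplex table above, and degrades every other non-ASCII sequence to '?'. A short sketch of the resulting behaviour (img, org and color are placeholder variables, not part of the patch):

    // "АБВ" (0xD0 0x90, 0xD0 0x91, 0xD0 0x92) renders with the new glyphs;
    // e.g. the first pair maps to c = 0x90 - 17 = 127, the first extended entry
    putText(img, "\xD0\x90\xD0\x91\xD0\x92", org, FONT_HERSHEY_COMPLEX, 1.0, color);
    // a stray continuation byte is replaced by '?'
    putText(img, "-\x81-", org, FONT_HERSHEY_COMPLEX, 1.0, color);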
@@ -2114,8 +2161,7 @@ Size getTextSize( const String& text, int fontFace, double fontScale, int thickn
         int c = (uchar)text[i];
         Point p;
 
-        if( c >= 127 || c < ' ' )
-            c = '?';
+        readCheck(c, i, text, fontFace);
 
         const char* ptr = faces[ascii[(c-' ')+1]];
         p.x = (uchar)ptr[0] - 'R';

From bb0631a365bd09cda1045c5f4a5bac3ff6ef0f9c Mon Sep 17 00:00:00 2001
From: Maksim Shabunin
Date: Thu, 16 Apr 2015 17:33:38 +0300
Subject: [PATCH 17/48] Add sample cmake project

---
 samples/cpp/example_cmake/CMakeLists.txt | 28 +++++++++++++
 samples/cpp/example_cmake/example.cpp    | 50 ++++++++++++++++++++++++
 2 files changed, 78 insertions(+)
 create mode 100644 samples/cpp/example_cmake/CMakeLists.txt
 create mode 100644 samples/cpp/example_cmake/example.cpp

diff --git a/samples/cpp/example_cmake/CMakeLists.txt b/samples/cpp/example_cmake/CMakeLists.txt
new file mode 100644
index 000000000..fe7e62981
--- /dev/null
+++ b/samples/cpp/example_cmake/CMakeLists.txt
@@ -0,0 +1,28 @@
+# cmake needs this line
+cmake_minimum_required(VERSION 2.8)
+
+# Define project name
+project(opencv_example_project)
+
+# Find OpenCV, you may need to set OpenCV_DIR variable
+# to the absolute path to the directory containing OpenCVConfig.cmake file
+# via the command line or GUI
+find_package(OpenCV REQUIRED)
+
+# If the package has been found, several variables will
+# be set, you can find the full list with descriptions
+# in the OpenCVConfig.cmake file.
+# Print some message showing some of them
+message(STATUS "OpenCV library status:")
+message(STATUS " version: ${OpenCV_VERSION}")
+message(STATUS " libraries: ${OpenCV_LIBS}")
+message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}")
+
+# Add OpenCV headers location to your include paths
+include_directories(${OpenCV_INCLUDE_DIRS})
+
+# Declare the executable target built from your sources
+add_executable(opencv_example example.cpp)
+
+# Link your application with OpenCV libraries
+target_link_libraries(opencv_example ${OpenCV_LIBS})
diff --git a/samples/cpp/example_cmake/example.cpp b/samples/cpp/example_cmake/example.cpp
new file mode 100644
index 000000000..cac5050b2
--- /dev/null
+++ b/samples/cpp/example_cmake/example.cpp
@@ -0,0 +1,50 @@
+#include "opencv2/core.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/videoio.hpp"
+#include <iostream>
+
+using namespace cv;
+using namespace std;
+
+void drawText(Mat & image);
+
+int main()
+{
+    cout << "Built with OpenCV " << CV_VERSION << endl;
+    Mat image;
+    VideoCapture capture;
+    capture.open(0);
+    if(capture.isOpened())
+    {
+        cout << "Capture is opened" << endl;
+        for(;;)
+        {
+            capture >> image;
+            if(image.empty())
+                break;
+            drawText(image);
+            imshow("Sample", image);
+            if(waitKey(10) >= 0)
+                break;
+        }
+    }
+    else
+    {
+        cout << "No capture" << endl;
+        image = Mat::zeros(480, 640, CV_8UC1);
+        drawText(image);
+        imshow("Sample", image);
+        waitKey(0);
+    }
+    return 0;
+}
+
+void drawText(Mat & image)
+{
+    putText(image, "Hello OpenCV",
+            Point(20, 50),
+            FONT_HERSHEY_COMPLEX, 1, // font face and scale
+            Scalar(255, 255, 255), // white
+            1, LINE_AA); // line thickness and type
+}

From b80142be69dcad9aacc0de40d7c89fb11e3f232f Mon Sep 17 00:00:00 2001
From: Maksim Shabunin
Date: Thu, 16 Apr 2015 17:34:15 +0300
Subject: [PATCH 18/48] Fix an issue in cmake tutorial

---
 .../introduction/linux_gcc_cmake/linux_gcc_cmake.markdown | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.markdown b/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.markdown
index 4f4adbed8..fd447307a 100644
--- a/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.markdown
+++
b/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.markdown @@ -53,9 +53,9 @@ Now you have to create your CMakeLists.txt file. It should look like this: cmake_minimum_required(VERSION 2.8) project( DisplayImage ) find_package( OpenCV REQUIRED ) -include_directories( \f${OpenCV_INCLUDE_DIRS} ) +include_directories( ${OpenCV_INCLUDE_DIRS} ) add_executable( DisplayImage DisplayImage.cpp ) -target_link_libraries( DisplayImage \f${OpenCV_LIBS} ) +target_link_libraries( DisplayImage ${OpenCV_LIBS} ) @endcode ### Generate the executable From 103336c76e4c4352193182ed441a40b89a9eb75d Mon Sep 17 00:00:00 2001 From: Deanna Hood Date: Thu, 16 Apr 2015 11:59:05 -0400 Subject: [PATCH 19/48] Fix Bug #3989: correctly identify ellipse with its axes parallel to x-y axes during semi-major axis calculation --- modules/imgproc/src/shapedescr.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/imgproc/src/shapedescr.cpp b/modules/imgproc/src/shapedescr.cpp index 5e0c432d9..65eddfd1a 100644 --- a/modules/imgproc/src/shapedescr.cpp +++ b/modules/imgproc/src/shapedescr.cpp @@ -447,9 +447,9 @@ cv::RotatedRect cv::fitEllipse( InputArray _points ) // store angle and radii rp[4] = -0.5 * atan2(gfp[2], gfp[1] - gfp[0]); // convert from APP angle usage t = sin(-2.0 * rp[4]); - if( fabs(t) > fabs(gfp[2])*min_eps ) + if( fabs(t) > min_eps ) t = gfp[2]/t; - else + else // ellipse is rotated by an integer multiple of pi/2 t = gfp[1] - gfp[0]; rp[2] = fabs(gfp[0] + gfp[1] - t); if( rp[2] > min_eps ) From ee11a2d266343e4af94875543503ad39ea2d2f4e Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 16 Apr 2015 23:00:26 +0300 Subject: [PATCH 20/48] fully implemented SSE and NEON cases of intrin.hpp; extended the HAL with some basic math functions --- modules/calib3d/test/test_fisheye.cpp | 2 +- modules/core/include/opencv2/core/base.hpp | 47 +- modules/core/include/opencv2/core/matx.hpp | 2 +- .../core/include/opencv2/core/operations.hpp | 4 +- modules/core/src/kmeans.cpp | 8 +- modules/core/src/lapack.cpp | 182 +- modules/core/src/mathfuncs.cpp | 1489 +---------------- modules/core/src/stat.cpp | 166 +- modules/features2d/src/kaze/AKAZEFeatures.cpp | 2 +- modules/hal/include/opencv2/hal.hpp | 25 +- modules/hal/include/opencv2/hal/defs.h | 4 +- modules/hal/include/opencv2/hal/intrin.hpp | 901 ++++++++-- modules/hal/src/mathfuncs.cpp | 1305 +++++++++++++++ modules/hal/src/matrix.cpp | 161 ++ modules/hal/src/precomp.hpp | 4 + modules/hal/src/stat.cpp | 154 +- modules/photo/src/arrays.hpp | 5 + modules/stitching/src/autocalib.cpp | 2 +- 18 files changed, 2460 insertions(+), 2003 deletions(-) diff --git a/modules/calib3d/test/test_fisheye.cpp b/modules/calib3d/test/test_fisheye.cpp index 553b81c39..d4212e94f 100644 --- a/modules/calib3d/test/test_fisheye.cpp +++ b/modules/calib3d/test/test_fisheye.cpp @@ -381,7 +381,7 @@ TEST_F(fisheyeTest, EtimateUncertainties) EXPECT_MAT_NEAR(errors.c, cv::Vec2d(0.890439368129246, 0.816096854937896), 1e-10); EXPECT_MAT_NEAR(errors.k, cv::Vec4d(0.00516248605191506, 0.0168181467500934, 0.0213118690274604, 0.00916010877545648), 1e-10); EXPECT_MAT_NEAR(err_std, cv::Vec2d(0.187475975266883, 0.185678953263995), 1e-10); - CV_Assert(abs(rms - 0.263782587133546) < 1e-10); + CV_Assert(fabs(rms - 0.263782587133546) < 1e-10); CV_Assert(errors.alpha == 0); } diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp index 83661a2fd..e4efe0fb9 100644 --- a/modules/core/include/opencv2/core/base.hpp +++ 
b/modules/core/include/opencv2/core/base.hpp
@@ -53,6 +53,7 @@
 
 #include "opencv2/core/cvdef.h"
 #include "opencv2/core/cvstd.hpp"
+#include "opencv2/hal.hpp"
 
 namespace cv
 {
@@ -419,6 +420,12 @@ typedef Hamming HammingLUT;
 
 /////////////////////////////////// inline norms ////////////////////////////////////
 
+template<typename _Tp> inline _Tp cv_abs(_Tp x) { return std::abs(x); }
+inline int cv_abs(uchar x) { return x; }
+inline int cv_abs(schar x) { return std::abs(x); }
+inline int cv_abs(ushort x) { return x; }
+inline int cv_abs(short x) { return std::abs(x); }
+
 template<typename _Tp, typename _AccTp> static inline
 _AccTp normL2Sqr(const _Tp* a, int n)
 {
@@ -447,12 +454,12 @@ _AccTp normL1(const _Tp* a, int n)
 #if CV_ENABLE_UNROLLED
     for(; i <= n - 4; i += 4 )
     {
-        s += (_AccTp)std::abs(a[i]) + (_AccTp)std::abs(a[i+1]) +
-             (_AccTp)std::abs(a[i+2]) + (_AccTp)std::abs(a[i+3]);
+        s += (_AccTp)cv_abs(a[i]) + (_AccTp)cv_abs(a[i+1]) +
+             (_AccTp)cv_abs(a[i+2]) + (_AccTp)cv_abs(a[i+3]);
     }
 #endif
     for( ; i < n; i++ )
-        s += std::abs(a[i]);
+        s += cv_abs(a[i]);
     return s;
 }
 
@@ -461,7 +468,7 @@ _AccTp normInf(const _Tp* a, int n)
 {
     _AccTp s = 0;
     for( int i = 0; i < n; i++ )
-        s = std::max(s, (_AccTp)std::abs(a[i]));
+        s = std::max(s, (_AccTp)cv_abs(a[i]));
     return s;
 }
 
@@ -485,11 +492,10 @@ _AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n)
     return s;
 }
 
-template<> inline
-float normL2Sqr(const float* a, const float* b, int n)
+inline float normL2Sqr(const float* a, const float* b, int n)
 {
     if( n >= 8 )
-        return normL2Sqr_(a, b, n);
+        return hal::normL2Sqr_(a, b, n);
     float s = 0;
     for( int i = 0; i < n; i++ )
     {
@@ -519,11 +525,10 @@ _AccTp normL1(const _Tp* a, const _Tp* b, int n)
     return s;
 }
 
-template<> inline
-float normL1(const float* a, const float* b, int n)
+inline float normL1(const float* a, const float* b, int n)
 {
     if( n >= 8 )
-        return normL1_(a, b, n);
+        return hal::normL1_(a, b, n);
     float s = 0;
     for( int i = 0; i < n; i++ )
     {
@@ -533,10 +538,9 @@ float normL1(const float* a, const float* b, int n)
     return s;
 }
 
-template<> inline
-int normL1(const uchar* a, const uchar* b, int n)
+inline int normL1(const uchar* a, const uchar* b, int n)
 {
-    return normL1_(a, b, n);
+    return hal::normL1_(a, b, n);
 }
 
 template<typename _Tp, typename _AccTp> static inline
@@ -551,6 +555,23 @@ _AccTp normInf(const _Tp* a, const _Tp* b, int n)
     return s;
 }
 
+/** @brief Computes the cube root of an argument.
+
+    The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
+    NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
+    single-precision data.
+    @param val A function argument.
+ */
+CV_EXPORTS_W float cubeRoot(float val);
+
+/** @brief Calculates the angle of a 2D vector in degrees.
+
+    The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
+    in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
+    @param x x-coordinate of the vector.
+    @param y y-coordinate of the vector.
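+    For example, fastAtan2(1.f, 1.f) returns a value close to 45.f and
+    fastAtan2(-1.f, 1.f) a value close to 315.f:
+    @code
+        float angle = cv::fastAtan2(1.f, 1.f); // approximately 45.f
+    @endcode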
+ */
+CV_EXPORTS_W float fastAtan2(float y, float x);
 
 ////////////////// forward declarations for important OpenCV types //////////////////
 
diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp
index 6cc5d0625..e9023243e 100644
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@@ -427,7 +427,7 @@ template<typename _Tp, int m> struct Matx_DetOp
     double operator ()(const Matx<_Tp, m, m>& a) const
     {
         Matx<_Tp, m, m> temp = a;
-        double p = LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0);
+        double p = hal::LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0);
         if( p == 0 )
             return p;
         for( int i = 0; i < m; i++ )
diff --git a/modules/core/include/opencv2/core/operations.hpp b/modules/core/include/opencv2/core/operations.hpp
index bced1a755..2c42e1f3a 100644
--- a/modules/core/include/opencv2/core/operations.hpp
+++ b/modules/core/include/opencv2/core/operations.hpp
@@ -72,9 +72,9 @@ template<typename _Tp, int m> struct Matx_FastInvOp
             b(i, i) = (_Tp)1;
 
         if( method == DECOMP_CHOLESKY )
-            return Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
+            return hal::Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
 
-        return LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
+        return hal::LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
     }
 };
 
diff --git a/modules/core/src/kmeans.cpp b/modules/core/src/kmeans.cpp
index cc86d2972..fe5a0cf6e 100644
--- a/modules/core/src/kmeans.cpp
+++ b/modules/core/src/kmeans.cpp
@@ -79,7 +79,7 @@ public:
         for ( int i = begin; i<end; ++i)
         {
             const float* center = centers.ptr<float>(k);
-            const double dist = normL2Sqr_(sample, center, dims);
+            const double dist = normL2Sqr(sample, center, dims);
 
             if( min_dist > dist )
             {
@@ -384,7 +384,7 @@ double cv::kmeans( InputArray _data, int K,
                 if( labels[i] != max_k )
                     continue;
                 sample = data.ptr<float>(i);
-                double dist = normL2Sqr_(sample, _old_center, dims);
+                double dist = normL2Sqr(sample, _old_center, dims);
 
                 if( max_dist <= dist )
                 {
diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp
index a766e5f2e..dea25dd64 100644
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -50,168 +50,6 @@
 namespace cv
 {
 
-/****************************************************************************************\
-*                     LU & Cholesky implementation for small matrices                    *
-\****************************************************************************************/
-
-template<typename _Tp> static inline int
-LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
-{
-    int i, j, k, p = 1;
-    astep /= sizeof(A[0]);
-    bstep /= sizeof(b[0]);
-
-    for( i = 0; i < m; i++ )
-    {
-        k = i;
-
-        for( j = i+1; j < m; j++ )
-            if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
-                k = j;
-
-        if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() )
-            return 0;
-
-        if( k != i )
-        {
-            for( j = i; j < m; j++ )
-                std::swap(A[i*astep + j], A[k*astep + j]);
-            if( b )
-                for( j = 0; j < n; j++ )
-                    std::swap(b[i*bstep + j], b[k*bstep + j]);
-            p = -p;
-        }
-
-        _Tp d = -1/A[i*astep + i];
-
-        for( j = i+1; j < m; j++ )
-        {
-            _Tp alpha = A[j*astep + i]*d;
-
-            for( k = i+1; k < m; k++ )
-                A[j*astep + k] += alpha*A[i*astep + k];
-
-            if( b )
-                for( k = 0; k < n; k++ )
-                    b[j*bstep + k] += alpha*b[i*bstep + k];
-        }
-
-        A[i*astep + i] = -d;
-    }
-
-    if( b )
-    {
-        for( i = m-1; i >= 0; i-- )
-            for( j = 0; j < n; j++ )
-            {
-                _Tp s = b[i*bstep + j];
-                for( k = i+1; k < m; k++ )
-                    s -= A[i*astep + k]*b[k*bstep + j];
-                b[i*bstep + j] = s*A[i*astep + i];
-            }
-    }
-
-    return p;
-}
-
-
-int LU(float* A, size_t astep, int m, float* b, size_t bstep,
int n)
-{
-    return LUImpl(A, astep, m, b, bstep, n);
-}
-
-
-int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
-{
-    return LUImpl(A, astep, m, b, bstep, n);
-}
-
-
-template<typename _Tp> static inline bool
-CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
-{
-    _Tp* L = A;
-    int i, j, k;
-    double s;
-    astep /= sizeof(A[0]);
-    bstep /= sizeof(b[0]);
-
-    for( i = 0; i < m; i++ )
-    {
-        for( j = 0; j < i; j++ )
-        {
-            s = A[i*astep + j];
-            for( k = 0; k < j; k++ )
-                s -= L[i*astep + k]*L[j*astep + k];
-            L[i*astep + j] = (_Tp)(s*L[j*astep + j]);
-        }
-        s = A[i*astep + i];
-        for( k = 0; k < j; k++ )
-        {
-            double t = L[i*astep + k];
-            s -= t*t;
-        }
-        if( s < std::numeric_limits<_Tp>::epsilon() )
-            return false;
-        L[i*astep + i] = (_Tp)(1./std::sqrt(s));
-    }
-
-    if( !b )
-        return true;
-
-    // LLt x = b
-    // 1: L y = b
-    // 2. Lt x = y
-
-    /*
-     [ L00             ]  y0   b0
-     [ L10 L11         ]  y1 = b1
-     [ L20 L21 L22     ]  y2   b2
-     [ L30 L31 L32 L33 ]  y3   b3
-
-     [ L00 L10 L20 L30 ]  x0   y0
-     [     L11 L21 L31 ]  x1 = y1
-     [         L22 L32 ]  x2   y2
-     [             L33 ]  x3   y3
-    */
-
-    for( i = 0; i < m; i++ )
-    {
-        for( j = 0; j < n; j++ )
-        {
-            s = b[i*bstep + j];
-            for( k = 0; k < i; k++ )
-                s -= L[i*astep + k]*b[k*bstep + j];
-            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
-        }
-    }
-
-    for( i = m-1; i >= 0; i-- )
-    {
-        for( j = 0; j < n; j++ )
-        {
-            s = b[i*bstep + j];
-            for( k = m-1; k > i; k-- )
-                s -= L[k*astep + i]*b[k*bstep + j];
-            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
-        }
-    }
-
-    return true;
-}
-
-
-bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
-{
-    return CholImpl(A, astep, m, b, bstep, n);
-}
-
-bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
-{
-    return CholImpl(A, astep, m, b, bstep, n);
-}
-
-
 template<typename _Tp> static inline _Tp hypot(_Tp a, _Tp b)
 {
     a = std::abs(a);
@@ -882,7 +720,7 @@ double cv::determinant( InputArray _mat )
             Mat a(rows, rows, CV_32F, (uchar*)buffer);
             mat.copyTo(a);
 
-            result = LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
+            result = hal::LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
             if( result )
             {
                 for( int i = 0; i < rows; i++ )
@@ -906,7 +744,7 @@
             Mat a(rows, rows, CV_64F, (uchar*)buffer);
             mat.copyTo(a);
 
-            result = LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
+            result = hal::LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
             if( result )
            {
                for( int i = 0; i < rows; i++ )
@@ -1169,13 +1007,13 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
         setIdentity(dst);
 
         if( method == DECOMP_LU && type == CV_32F )
-            result = LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
+            result = hal::LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
         else if( method == DECOMP_LU && type == CV_64F )
-            result = LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
+            result = hal::LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
        else if( method == DECOMP_CHOLESKY && type == CV_32F )
-            result = Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
+            result = hal::Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
         else
-            result = Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
+            result = hal::Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
 
         if( !result )
             dst = Scalar(0);
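The determinant() hunks above show the intended hal::LU usage: the factorization runs in place, the return value carries the permutation sign (0 for a singular matrix), and, following the LUImpl convention visible in the removed code (A[i*astep + i] = -d), the diagonal ends up holding the inverted pivots. A condensed sketch of the pattern, assuming a square CV_64F matrix m (placeholder name) and that hal::LU keeps this convention:

    Mat a = m.clone(); // hal::LU factorizes in place
    double det = hal::LU(a.ptr<double>(), a.step, a.rows, 0, 0, 0);
    if( det ) // zero means the matrix is singular
    {
        for( int i = 0; i < a.rows; i++ )
            det *= a.at<double>(i, i); // product of inverted pivots
        det = 1./det; // undo the inversion; the permutation sign is already in det
    }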
n, dst.ptr(), dst.step, nb) != 0; + result = hal::LU(a.ptr(), a.step, n, dst.ptr(), dst.step, nb) != 0; } else if( method == DECOMP_CHOLESKY ) { if( type == CV_32F ) - result = Cholesky(a.ptr(), a.step, n, dst.ptr(), dst.step, nb); + result = hal::Cholesky(a.ptr(), a.step, n, dst.ptr(), dst.step, nb); else - result = Cholesky(a.ptr(), a.step, n, dst.ptr(), dst.step, nb); + result = hal::Cholesky(a.ptr(), a.step, n, dst.ptr(), dst.step, nb); } else { diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index 446b62731..e96eaeb41 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -121,107 +121,6 @@ float fastAtan2( float y, float x ) return a; } -static void FastAtan2_32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees=true ) -{ - int i = 0; - float scale = angleInDegrees ? 1 : (float)(CV_PI/180); - -#ifdef HAVE_TEGRA_OPTIMIZATION - if (tegra::useTegra() && tegra::FastAtan2_32f(Y, X, angle, len, scale)) - return; -#endif - -#if CV_SSE2 - if( USE_SSE2 ) - { - Cv32suf iabsmask; iabsmask.i = 0x7fffffff; - __m128 eps = _mm_set1_ps((float)DBL_EPSILON), absmask = _mm_set1_ps(iabsmask.f); - __m128 _90 = _mm_set1_ps(90.f), _180 = _mm_set1_ps(180.f), _360 = _mm_set1_ps(360.f); - __m128 z = _mm_setzero_ps(), scale4 = _mm_set1_ps(scale); - __m128 p1 = _mm_set1_ps(atan2_p1), p3 = _mm_set1_ps(atan2_p3); - __m128 p5 = _mm_set1_ps(atan2_p5), p7 = _mm_set1_ps(atan2_p7); - - for( ; i <= len - 4; i += 4 ) - { - __m128 x = _mm_loadu_ps(X + i), y = _mm_loadu_ps(Y + i); - __m128 ax = _mm_and_ps(x, absmask), ay = _mm_and_ps(y, absmask); - __m128 mask = _mm_cmplt_ps(ax, ay); - __m128 tmin = _mm_min_ps(ax, ay), tmax = _mm_max_ps(ax, ay); - __m128 c = _mm_div_ps(tmin, _mm_add_ps(tmax, eps)); - __m128 c2 = _mm_mul_ps(c, c); - __m128 a = _mm_mul_ps(c2, p7); - a = _mm_mul_ps(_mm_add_ps(a, p5), c2); - a = _mm_mul_ps(_mm_add_ps(a, p3), c2); - a = _mm_mul_ps(_mm_add_ps(a, p1), c); - - __m128 b = _mm_sub_ps(_90, a); - a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); - - b = _mm_sub_ps(_180, a); - mask = _mm_cmplt_ps(x, z); - a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); - - b = _mm_sub_ps(_360, a); - mask = _mm_cmplt_ps(y, z); - a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); - - a = _mm_mul_ps(a, scale4); - _mm_storeu_ps(angle + i, a); - } - } -#elif CV_NEON - float32x4_t eps = vdupq_n_f32((float)DBL_EPSILON); - float32x4_t _90 = vdupq_n_f32(90.f), _180 = vdupq_n_f32(180.f), _360 = vdupq_n_f32(360.f); - float32x4_t z = vdupq_n_f32(0.0f), scale4 = vdupq_n_f32(scale); - float32x4_t p1 = vdupq_n_f32(atan2_p1), p3 = vdupq_n_f32(atan2_p3); - float32x4_t p5 = vdupq_n_f32(atan2_p5), p7 = vdupq_n_f32(atan2_p7); - - for( ; i <= len - 4; i += 4 ) - { - float32x4_t x = vld1q_f32(X + i), y = vld1q_f32(Y + i); - float32x4_t ax = vabsq_f32(x), ay = vabsq_f32(y); - float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); - float32x4_t c = vmulq_f32(tmin, cv_vrecpq_f32(vaddq_f32(tmax, eps))); - float32x4_t c2 = vmulq_f32(c, c); - float32x4_t a = vmulq_f32(c2, p7); - a = vmulq_f32(vaddq_f32(a, p5), c2); - a = vmulq_f32(vaddq_f32(a, p3), c2); - a = vmulq_f32(vaddq_f32(a, p1), c); - - a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); - a = vbslq_f32(vcltq_f32(x, z), vsubq_f32(_180, a), a); - a = vbslq_f32(vcltq_f32(y, z), vsubq_f32(_360, a), a); - - vst1q_f32(angle + i, vmulq_f32(a, scale4)); - } -#endif - - for( ; i < len; i++ ) - { - float x = X[i], y = Y[i]; - float ax = std::abs(x), ay = std::abs(y); - float a, c, c2; - 
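/* The if/else below evaluates atan(min/max) with a degree-7 odd polynomial
   and then folds the result into [0, 360) by quadrant, so the SSE2 and NEON
   branches above are just vectorized forms of the same math. A minimal scalar
   sketch of that idea, returning degrees (atan2_deg_ref is an illustrative
   name, not an OpenCV symbol; atan2_p1..atan2_p7 are the coefficients defined
   earlier in this file):

       static float atan2_deg_ref(float y, float x)
       {
           float ax = std::abs(x), ay = std::abs(y);
           float c  = std::min(ax, ay)/(std::max(ax, ay) + (float)DBL_EPSILON);
           float c2 = c*c;
           float a  = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
           if( ay > ax ) a = 90.f - a;   // reflect about the 45-degree line
           if( x < 0 )   a = 180.f - a;  // quadrants II and III
           if( y < 0 )   a = 360.f - a;  // below the x axis
           return a;
       }
*/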
if( ax >= ay ) - { - c = ay/(ax + (float)DBL_EPSILON); - c2 = c*c; - a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; - } - else - { - c = ax/(ay + (float)DBL_EPSILON); - c2 = c*c; - a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; - } - if( x < 0 ) - a = 180.f - a; - if( y < 0 ) - a = 360.f - a; - angle[i] = (float)(a*scale); - } -} - - /* ************************************************************************** *\ Fast cube root by Ken Turkowski (http://www.worldserver.com/turk/computergraphics/papers.html) @@ -263,255 +162,6 @@ float cubeRoot( float value ) return v.f; } -static void Magnitude_32f(const float* x, const float* y, float* mag, int len) -{ -#if defined HAVE_IPP && 0 - CV_IPP_CHECK() - { - IppStatus status = ippsMagnitude_32f(x, y, mag, len); - if (status >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - - int i = 0; - -#if CV_SSE - if( USE_SSE2 ) - { - for( ; i <= len - 8; i += 8 ) - { - __m128 x0 = _mm_loadu_ps(x + i), x1 = _mm_loadu_ps(x + i + 4); - __m128 y0 = _mm_loadu_ps(y + i), y1 = _mm_loadu_ps(y + i + 4); - x0 = _mm_add_ps(_mm_mul_ps(x0, x0), _mm_mul_ps(y0, y0)); - x1 = _mm_add_ps(_mm_mul_ps(x1, x1), _mm_mul_ps(y1, y1)); - x0 = _mm_sqrt_ps(x0); x1 = _mm_sqrt_ps(x1); - _mm_storeu_ps(mag + i, x0); _mm_storeu_ps(mag + i + 4, x1); - } - } -#elif CV_NEON - for( ; i <= len - 4; i += 4 ) - { - float32x4_t v_x = vld1q_f32(x + i), v_y = vld1q_f32(y + i); - vst1q_f32(mag + i, cv_vsqrtq_f32(vmlaq_f32(vmulq_f32(v_x, v_x), v_y, v_y))); - } - for( ; i <= len - 2; i += 2 ) - { - float32x2_t v_x = vld1_f32(x + i), v_y = vld1_f32(y + i); - vst1_f32(mag + i, cv_vsqrt_f32(vmla_f32(vmul_f32(v_x, v_x), v_y, v_y))); - } -#endif - - for( ; i < len; i++ ) - { - float x0 = x[i], y0 = y[i]; - mag[i] = std::sqrt(x0*x0 + y0*y0); - } -} - -static void Magnitude_64f(const double* x, const double* y, double* mag, int len) -{ -#if defined(HAVE_IPP) - CV_IPP_CHECK() - { - IppStatus status = ippsMagnitude_64f(x, y, mag, len); - if (status >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - - int i = 0; - -#if CV_SSE2 - if( USE_SSE2 ) - { - for( ; i <= len - 4; i += 4 ) - { - __m128d x0 = _mm_loadu_pd(x + i), x1 = _mm_loadu_pd(x + i + 2); - __m128d y0 = _mm_loadu_pd(y + i), y1 = _mm_loadu_pd(y + i + 2); - x0 = _mm_add_pd(_mm_mul_pd(x0, x0), _mm_mul_pd(y0, y0)); - x1 = _mm_add_pd(_mm_mul_pd(x1, x1), _mm_mul_pd(y1, y1)); - x0 = _mm_sqrt_pd(x0); x1 = _mm_sqrt_pd(x1); - _mm_storeu_pd(mag + i, x0); _mm_storeu_pd(mag + i + 2, x1); - } - } -#endif - - for( ; i < len; i++ ) - { - double x0 = x[i], y0 = y[i]; - mag[i] = std::sqrt(x0*x0 + y0*y0); - } -} - - -static void InvSqrt_32f(const float* src, float* dst, int len) -{ -#if defined(HAVE_IPP) - CV_IPP_CHECK() - { - if (ippsInvSqrt_32f_A21(src, dst, len) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - - int i = 0; - -#if CV_SSE - if( USE_SSE2 ) - { - __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f); - if( (((size_t)src|(size_t)dst) & 15) == 0 ) - for( ; i <= len - 8; i += 8 ) - { - __m128 t0 = _mm_load_ps(src + i), t1 = _mm_load_ps(src + i + 4); - __m128 h0 = _mm_mul_ps(t0, _0_5), h1 = _mm_mul_ps(t1, _0_5); - t0 = _mm_rsqrt_ps(t0); t1 = _mm_rsqrt_ps(t1); - t0 = _mm_mul_ps(t0, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t0,t0),h0))); - t1 = _mm_mul_ps(t1, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t1,t1),h1))); - _mm_store_ps(dst + i, t0); _mm_store_ps(dst + i + 4, t1); - } - else - for( ; i <= len - 8; i 
+= 8 ) - { - __m128 t0 = _mm_loadu_ps(src + i), t1 = _mm_loadu_ps(src + i + 4); - __m128 h0 = _mm_mul_ps(t0, _0_5), h1 = _mm_mul_ps(t1, _0_5); - t0 = _mm_rsqrt_ps(t0); t1 = _mm_rsqrt_ps(t1); - t0 = _mm_mul_ps(t0, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t0,t0),h0))); - t1 = _mm_mul_ps(t1, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t1,t1),h1))); - _mm_storeu_ps(dst + i, t0); _mm_storeu_ps(dst + i + 4, t1); - } - } -#elif CV_NEON - for ( ; i <= len - 8; i += 8) - { - vst1q_f32(dst + i, cv_vrsqrtq_f32(vld1q_f32(src + i))); - vst1q_f32(dst + i + 4, cv_vrsqrtq_f32(vld1q_f32(src + i + 4))); - } -#endif - - for( ; i < len; i++ ) - dst[i] = 1/std::sqrt(src[i]); -} - - -static void InvSqrt_64f(const double* src, double* dst, int len) -{ - int i = 0; - -#if CV_SSE2 - if (USE_SSE2) - { - __m128d v_1 = _mm_set1_pd(1.0); - for ( ; i <= len - 2; i += 2) - _mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i)))); - } -#endif - - for( ; i < len; i++ ) - dst[i] = 1/std::sqrt(src[i]); -} - - -static void Sqrt_32f(const float* src, float* dst, int len) -{ -#if defined(HAVE_IPP) - CV_IPP_CHECK() - { - if (ippsSqrt_32f_A21(src, dst, len) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - int i = 0; - -#if CV_SSE - if( USE_SSE2 ) - { - if( (((size_t)src|(size_t)dst) & 15) == 0 ) - for( ; i <= len - 8; i += 8 ) - { - __m128 t0 = _mm_load_ps(src + i), t1 = _mm_load_ps(src + i + 4); - t0 = _mm_sqrt_ps(t0); t1 = _mm_sqrt_ps(t1); - _mm_store_ps(dst + i, t0); _mm_store_ps(dst + i + 4, t1); - } - else - for( ; i <= len - 8; i += 8 ) - { - __m128 t0 = _mm_loadu_ps(src + i), t1 = _mm_loadu_ps(src + i + 4); - t0 = _mm_sqrt_ps(t0); t1 = _mm_sqrt_ps(t1); - _mm_storeu_ps(dst + i, t0); _mm_storeu_ps(dst + i + 4, t1); - } - } -#elif CV_NEON - for ( ; i <= len - 8; i += 8) - { - vst1q_f32(dst + i, cv_vsqrtq_f32(vld1q_f32(src + i))); - vst1q_f32(dst + i + 4, cv_vsqrtq_f32(vld1q_f32(src + i + 4))); - } -#endif - - for( ; i < len; i++ ) - dst[i] = std::sqrt(src[i]); -} - - -static void Sqrt_64f(const double* src, double* dst, int len) -{ -#if defined(HAVE_IPP) - CV_IPP_CHECK() - { - if (ippsSqrt_64f_A50(src, dst, len) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - - int i = 0; - -#if CV_SSE2 - if( USE_SSE2 ) - { - if( (((size_t)src|(size_t)dst) & 15) == 0 ) - for( ; i <= len - 4; i += 4 ) - { - __m128d t0 = _mm_load_pd(src + i), t1 = _mm_load_pd(src + i + 2); - t0 = _mm_sqrt_pd(t0); t1 = _mm_sqrt_pd(t1); - _mm_store_pd(dst + i, t0); _mm_store_pd(dst + i + 2, t1); - } - else - for( ; i <= len - 4; i += 4 ) - { - __m128d t0 = _mm_loadu_pd(src + i), t1 = _mm_loadu_pd(src + i + 2); - t0 = _mm_sqrt_pd(t0); t1 = _mm_sqrt_pd(t1); - _mm_storeu_pd(dst + i, t0); _mm_storeu_pd(dst + i + 2, t1); - } - } -#endif - - for( ; i < len; i++ ) - dst[i] = std::sqrt(src[i]); -} - - /****************************************************************************************\ * Cartezian -> Polar * \****************************************************************************************/ @@ -539,13 +189,13 @@ void magnitude( InputArray src1, InputArray src2, OutputArray dst ) { const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1]; float *mag = (float*)ptrs[2]; - Magnitude_32f( x, y, mag, len ); + hal::magnitude( x, y, mag, len ); } else { const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1]; double *mag = (double*)ptrs[2]; - Magnitude_64f( x, y, mag, len ); + hal::magnitude( x, y, mag, len ); } } } @@ -588,7 +238,7 @@ void phase( 
InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre { const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1]; float *angle = (float*)ptrs[2]; - FastAtan2_32f( y, x, angle, len, angleInDegrees ); + hal::fastAtan2( y, x, angle, len, angleInDegrees ); } else { @@ -618,7 +268,7 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre buf[1][k] = (float)y[k]; } - FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees ); + hal::fastAtan2( buf[1], buf[0], buf[0], len, angleInDegrees ); k = 0; #if CV_SSE2 @@ -722,15 +372,15 @@ void cartToPolar( InputArray src1, InputArray src2, { const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1]; float *mag = (float*)ptrs[2], *angle = (float*)ptrs[3]; - Magnitude_32f( x, y, mag, len ); - FastAtan2_32f( y, x, angle, len, angleInDegrees ); + hal::magnitude( x, y, mag, len ); + hal::fastAtan2( y, x, angle, len, angleInDegrees ); } else { const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1]; double *angle = (double*)ptrs[3]; - Magnitude_64f(x, y, (double*)ptrs[2], len); + hal::magnitude(x, y, (double*)ptrs[2], len); k = 0; #if CV_SSE2 @@ -755,7 +405,7 @@ void cartToPolar( InputArray src1, InputArray src2, buf[1][k] = (float)y[k]; } - FastAtan2_32f( buf[1], buf[0], buf[0], len, angleInDegrees ); + hal::fastAtan2( buf[1], buf[0], buf[0], len, angleInDegrees ); k = 0; #if CV_SSE2 @@ -1096,482 +746,6 @@ void polarToCart( InputArray src1, InputArray src2, * E X P * \****************************************************************************************/ -typedef union -{ - struct { -#if ( defined( WORDS_BIGENDIAN ) && !defined( OPENCV_UNIVERSAL_BUILD ) ) || defined( __BIG_ENDIAN__ ) - int hi; - int lo; -#else - int lo; - int hi; -#endif - } i; - double d; -} -DBLINT; - -#define EXPTAB_SCALE 6 -#define EXPTAB_MASK ((1 << EXPTAB_SCALE) - 1) - -#define EXPPOLY_32F_A0 .9670371139572337719125840413672004409288e-2 - -static const double expTab[] = { - 1.0 * EXPPOLY_32F_A0, - 1.0108892860517004600204097905619 * EXPPOLY_32F_A0, - 1.0218971486541166782344801347833 * EXPPOLY_32F_A0, - 1.0330248790212284225001082839705 * EXPPOLY_32F_A0, - 1.0442737824274138403219664787399 * EXPPOLY_32F_A0, - 1.0556451783605571588083413251529 * EXPPOLY_32F_A0, - 1.0671404006768236181695211209928 * EXPPOLY_32F_A0, - 1.0787607977571197937406800374385 * EXPPOLY_32F_A0, - 1.0905077326652576592070106557607 * EXPPOLY_32F_A0, - 1.1023825833078409435564142094256 * EXPPOLY_32F_A0, - 1.1143867425958925363088129569196 * EXPPOLY_32F_A0, - 1.126521618608241899794798643787 * EXPPOLY_32F_A0, - 1.1387886347566916537038302838415 * EXPPOLY_32F_A0, - 1.151189229952982705817759635202 * EXPPOLY_32F_A0, - 1.1637248587775775138135735990922 * EXPPOLY_32F_A0, - 1.1763969916502812762846457284838 * EXPPOLY_32F_A0, - 1.1892071150027210667174999705605 * EXPPOLY_32F_A0, - 1.2021567314527031420963969574978 * EXPPOLY_32F_A0, - 1.2152473599804688781165202513388 * EXPPOLY_32F_A0, - 1.2284805361068700056940089577928 * EXPPOLY_32F_A0, - 1.2418578120734840485936774687266 * EXPPOLY_32F_A0, - 1.2553807570246910895793906574423 * EXPPOLY_32F_A0, - 1.2690509571917332225544190810323 * EXPPOLY_32F_A0, - 1.2828700160787782807266697810215 * EXPPOLY_32F_A0, - 1.2968395546510096659337541177925 * EXPPOLY_32F_A0, - 1.3109612115247643419229917863308 * EXPPOLY_32F_A0, - 1.3252366431597412946295370954987 * EXPPOLY_32F_A0, - 1.3396675240533030053600306697244 * EXPPOLY_32F_A0, - 1.3542555469368927282980147401407 * EXPPOLY_32F_A0, - 
1.3690024229745906119296011329822 * EXPPOLY_32F_A0, - 1.3839098819638319548726595272652 * EXPPOLY_32F_A0, - 1.3989796725383111402095281367152 * EXPPOLY_32F_A0, - 1.4142135623730950488016887242097 * EXPPOLY_32F_A0, - 1.4296133383919700112350657782751 * EXPPOLY_32F_A0, - 1.4451808069770466200370062414717 * EXPPOLY_32F_A0, - 1.4609177941806469886513028903106 * EXPPOLY_32F_A0, - 1.476826145939499311386907480374 * EXPPOLY_32F_A0, - 1.4929077282912648492006435314867 * EXPPOLY_32F_A0, - 1.5091644275934227397660195510332 * EXPPOLY_32F_A0, - 1.5255981507445383068512536895169 * EXPPOLY_32F_A0, - 1.5422108254079408236122918620907 * EXPPOLY_32F_A0, - 1.5590044002378369670337280894749 * EXPPOLY_32F_A0, - 1.5759808451078864864552701601819 * EXPPOLY_32F_A0, - 1.5931421513422668979372486431191 * EXPPOLY_32F_A0, - 1.6104903319492543081795206673574 * EXPPOLY_32F_A0, - 1.628027421857347766848218522014 * EXPPOLY_32F_A0, - 1.6457554781539648445187567247258 * EXPPOLY_32F_A0, - 1.6636765803267364350463364569764 * EXPPOLY_32F_A0, - 1.6817928305074290860622509524664 * EXPPOLY_32F_A0, - 1.7001063537185234695013625734975 * EXPPOLY_32F_A0, - 1.7186192981224779156293443764563 * EXPPOLY_32F_A0, - 1.7373338352737062489942020818722 * EXPPOLY_32F_A0, - 1.7562521603732994831121606193753 * EXPPOLY_32F_A0, - 1.7753764925265212525505592001993 * EXPPOLY_32F_A0, - 1.7947090750031071864277032421278 * EXPPOLY_32F_A0, - 1.8142521755003987562498346003623 * EXPPOLY_32F_A0, - 1.8340080864093424634870831895883 * EXPPOLY_32F_A0, - 1.8539791250833855683924530703377 * EXPPOLY_32F_A0, - 1.8741676341102999013299989499544 * EXPPOLY_32F_A0, - 1.8945759815869656413402186534269 * EXPPOLY_32F_A0, - 1.9152065613971472938726112702958 * EXPPOLY_32F_A0, - 1.9360617934922944505980559045667 * EXPPOLY_32F_A0, - 1.9571441241754002690183222516269 * EXPPOLY_32F_A0, - 1.9784560263879509682582499181312 * EXPPOLY_32F_A0, -}; - - -// the code below uses _mm_cast* intrinsics, which are not avialable on VS2005 -#if (defined _MSC_VER && _MSC_VER < 1500) || \ - (!defined __APPLE__ && defined __GNUC__ && __GNUC__*100 + __GNUC_MINOR__ < 402) -#undef CV_SSE2 -#define CV_SSE2 0 -#endif - -static const double exp_prescale = 1.4426950408889634073599246810019 * (1 << EXPTAB_SCALE); -static const double exp_postscale = 1./(1 << EXPTAB_SCALE); -static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < 3000 - -static void Exp_32f( const float *_x, float *y, int n ) -{ - static const float - A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0), - A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0), - A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0), - A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0); - -#undef EXPPOLY -#define EXPPOLY(x) \ - (((((x) + A1)*(x) + A2)*(x) + A3)*(x) + A4) - - int i = 0; - const Cv32suf* x = (const Cv32suf*)_x; - Cv32suf buf[4]; - -#if CV_SSE2 - if( n >= 8 && USE_SSE2 ) - { - static const __m128d prescale2 = _mm_set1_pd(exp_prescale); - static const __m128 postscale4 = _mm_set1_ps((float)exp_postscale); - static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale)); - static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale)); - - static const __m128 mA1 = _mm_set1_ps(A1); - static const __m128 mA2 = _mm_set1_ps(A2); - static const __m128 mA3 = _mm_set1_ps(A3); - static const __m128 mA4 = _mm_set1_ps(A4); - bool y_aligned = (size_t)(void*)y % 16 == 0; - - ushort CV_DECL_ALIGNED(16) tab_idx[8]; - - 
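/* Range reduction shared by the SIMD loop below and the scalar tail: with
   EXPTAB_SCALE = 6, x is rescaled by exp_prescale = 2^6/ln(2) and split into
   an integer part val and a residual f, so that

       exp(x) = 2^(val >> 6) * expTab[val & 63] * EXPPOLY(f)

   where expTab[k] holds 2^(k/64) * EXPPOLY_32F_A0 and EXPPOLY is the small
   polynomial defined above. A minimal scalar sketch, omitting the clamping of
   the exponent field to [0, 255] (exp_ref is an illustrative name, not an
   OpenCV symbol):

       static float exp_ref(float xf)
       {
           double x  = xf*exp_prescale;
           int val   = cvRound(x);
           double f  = (x - val)*exp_postscale;          // residual, |f| <= 1/128
           Cv32suf p;
           p.i = ((val >> EXPTAB_SCALE) + 127) << 23;    // builds 2^(val >> 6)
           return (float)(p.f*expTab[val & EXPTAB_MASK]*EXPPOLY(f));
       }
*/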
for( ; i <= n - 8; i += 8 ) - { - __m128 xf0, xf1; - xf0 = _mm_loadu_ps(&x[i].f); - xf1 = _mm_loadu_ps(&x[i+4].f); - __m128i xi0, xi1, xi2, xi3; - - xf0 = _mm_min_ps(_mm_max_ps(xf0, minval4), maxval4); - xf1 = _mm_min_ps(_mm_max_ps(xf1, minval4), maxval4); - - __m128d xd0 = _mm_cvtps_pd(xf0); - __m128d xd2 = _mm_cvtps_pd(_mm_movehl_ps(xf0, xf0)); - __m128d xd1 = _mm_cvtps_pd(xf1); - __m128d xd3 = _mm_cvtps_pd(_mm_movehl_ps(xf1, xf1)); - - xd0 = _mm_mul_pd(xd0, prescale2); - xd2 = _mm_mul_pd(xd2, prescale2); - xd1 = _mm_mul_pd(xd1, prescale2); - xd3 = _mm_mul_pd(xd3, prescale2); - - xi0 = _mm_cvtpd_epi32(xd0); - xi2 = _mm_cvtpd_epi32(xd2); - - xi1 = _mm_cvtpd_epi32(xd1); - xi3 = _mm_cvtpd_epi32(xd3); - - xd0 = _mm_sub_pd(xd0, _mm_cvtepi32_pd(xi0)); - xd2 = _mm_sub_pd(xd2, _mm_cvtepi32_pd(xi2)); - xd1 = _mm_sub_pd(xd1, _mm_cvtepi32_pd(xi1)); - xd3 = _mm_sub_pd(xd3, _mm_cvtepi32_pd(xi3)); - - xf0 = _mm_movelh_ps(_mm_cvtpd_ps(xd0), _mm_cvtpd_ps(xd2)); - xf1 = _mm_movelh_ps(_mm_cvtpd_ps(xd1), _mm_cvtpd_ps(xd3)); - - xf0 = _mm_mul_ps(xf0, postscale4); - xf1 = _mm_mul_ps(xf1, postscale4); - - xi0 = _mm_unpacklo_epi64(xi0, xi2); - xi1 = _mm_unpacklo_epi64(xi1, xi3); - xi0 = _mm_packs_epi32(xi0, xi1); - - _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK))); - - xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127)); - xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255)); - xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); - - __m128d yd0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1])); - __m128d yd1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3])); - __m128d yd2 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[4]), _mm_load_sd(expTab + tab_idx[5])); - __m128d yd3 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[6]), _mm_load_sd(expTab + tab_idx[7])); - - __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); - __m128 yf1 = _mm_movelh_ps(_mm_cvtpd_ps(yd2), _mm_cvtpd_ps(yd3)); - - yf0 = _mm_mul_ps(yf0, _mm_castsi128_ps(_mm_slli_epi32(xi0, 23))); - yf1 = _mm_mul_ps(yf1, _mm_castsi128_ps(_mm_slli_epi32(xi1, 23))); - - __m128 zf0 = _mm_add_ps(xf0, mA1); - __m128 zf1 = _mm_add_ps(xf1, mA1); - - zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA2); - zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA2); - - zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA3); - zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA3); - - zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA4); - zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA4); - - zf0 = _mm_mul_ps(zf0, yf0); - zf1 = _mm_mul_ps(zf1, yf1); - - if( y_aligned ) - { - _mm_store_ps(y + i, zf0); - _mm_store_ps(y + i + 4, zf1); - } - else - { - _mm_storeu_ps(y + i, zf0); - _mm_storeu_ps(y + i + 4, zf1); - } - } - } - else -#endif - for( ; i <= n - 4; i += 4 ) - { - double x0 = x[i].f * exp_prescale; - double x1 = x[i + 1].f * exp_prescale; - double x2 = x[i + 2].f * exp_prescale; - double x3 = x[i + 3].f * exp_prescale; - int val0, val1, val2, val3, t; - - if( ((x[i].i >> 23) & 255) > 127 + 10 ) - x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; - - if( ((x[i+1].i >> 23) & 255) > 127 + 10 ) - x1 = x[i+1].i < 0 ? -exp_max_val : exp_max_val; - - if( ((x[i+2].i >> 23) & 255) > 127 + 10 ) - x2 = x[i+2].i < 0 ? -exp_max_val : exp_max_val; - - if( ((x[i+3].i >> 23) & 255) > 127 + 10 ) - x3 = x[i+3].i < 0 ? 
-exp_max_val : exp_max_val; - - val0 = cvRound(x0); - val1 = cvRound(x1); - val2 = cvRound(x2); - val3 = cvRound(x3); - - x0 = (x0 - val0)*exp_postscale; - x1 = (x1 - val1)*exp_postscale; - x2 = (x2 - val2)*exp_postscale; - x3 = (x3 - val3)*exp_postscale; - - t = (val0 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[0].i = t << 23; - - t = (val1 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[1].i = t << 23; - - t = (val2 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[2].i = t << 23; - - t = (val3 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - buf[3].i = t << 23; - - x0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); - x1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); - - y[i] = (float)x0; - y[i + 1] = (float)x1; - - x2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); - x3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); - - y[i + 2] = (float)x2; - y[i + 3] = (float)x3; - } - - for( ; i < n; i++ ) - { - double x0 = x[i].f * exp_prescale; - int val0, t; - - if( ((x[i].i >> 23) & 255) > 127 + 10 ) - x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; - - val0 = cvRound(x0); - t = (val0 >> EXPTAB_SCALE) + 127; - t = !(t & ~255) ? t : t < 0 ? 0 : 255; - - buf[0].i = t << 23; - x0 = (x0 - val0)*exp_postscale; - - y[i] = (float)(buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0)); - } -} - - -static void Exp_64f( const double *_x, double *y, int n ) -{ - static const double - A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0, - A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0, - A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0, - A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0, - A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0, - A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0; - -#undef EXPPOLY -#define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5) - - int i = 0; - Cv64suf buf[4]; - const Cv64suf* x = (const Cv64suf*)_x; - -#if CV_SSE2 - if( USE_SSE2 ) - { - static const __m128d prescale2 = _mm_set1_pd(exp_prescale); - static const __m128d postscale2 = _mm_set1_pd(exp_postscale); - static const __m128d maxval2 = _mm_set1_pd(exp_max_val); - static const __m128d minval2 = _mm_set1_pd(-exp_max_val); - - static const __m128d mA0 = _mm_set1_pd(A0); - static const __m128d mA1 = _mm_set1_pd(A1); - static const __m128d mA2 = _mm_set1_pd(A2); - static const __m128d mA3 = _mm_set1_pd(A3); - static const __m128d mA4 = _mm_set1_pd(A4); - static const __m128d mA5 = _mm_set1_pd(A5); - - int CV_DECL_ALIGNED(16) tab_idx[4]; - - for( ; i <= n - 4; i += 4 ) - { - __m128d xf0 = _mm_loadu_pd(&x[i].f), xf1 = _mm_loadu_pd(&x[i+2].f); - __m128i xi0, xi1; - xf0 = _mm_min_pd(_mm_max_pd(xf0, minval2), maxval2); - xf1 = _mm_min_pd(_mm_max_pd(xf1, minval2), maxval2); - xf0 = _mm_mul_pd(xf0, prescale2); - xf1 = _mm_mul_pd(xf1, prescale2); - - xi0 = _mm_cvtpd_epi32(xf0); - xi1 = _mm_cvtpd_epi32(xf1); - xf0 = _mm_mul_pd(_mm_sub_pd(xf0, _mm_cvtepi32_pd(xi0)), postscale2); - xf1 = _mm_mul_pd(_mm_sub_pd(xf1, _mm_cvtepi32_pd(xi1)), postscale2); - - xi0 = _mm_unpacklo_epi64(xi0, xi1); - _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi32(EXPTAB_MASK))); - - xi0 = _mm_add_epi32(_mm_srai_epi32(xi0, EXPTAB_SCALE), _mm_set1_epi32(1023)); - xi0 = _mm_packs_epi32(xi0, xi0); - xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); - xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(2047)); - xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); - 
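/* At this point xi0 holds the biased exponents (val >> EXPTAB_SCALE) + 1023
   as 32-bit lanes, already clamped to the legal double range [0, 2047]. The
   two epi32 unpacks below spread them into 64-bit lanes so that
   _mm_slli_epi64(..., 52) can move each value into the exponent field of an
   IEEE-754 double, i.e. build the factor 2^(val >> EXPTAB_SCALE) without a
   table lookup - the vector counterpart of buf[k].i = (int64)t << 52 in the
   scalar tail. */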
xi1 = _mm_unpackhi_epi32(xi0, _mm_setzero_si128()); - xi0 = _mm_unpacklo_epi32(xi0, _mm_setzero_si128()); - - __m128d yf0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1])); - __m128d yf1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3])); - yf0 = _mm_mul_pd(yf0, _mm_castsi128_pd(_mm_slli_epi64(xi0, 52))); - yf1 = _mm_mul_pd(yf1, _mm_castsi128_pd(_mm_slli_epi64(xi1, 52))); - - __m128d zf0 = _mm_add_pd(_mm_mul_pd(mA0, xf0), mA1); - __m128d zf1 = _mm_add_pd(_mm_mul_pd(mA0, xf1), mA1); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA2); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA2); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA3); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA3); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA4); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA4); - - zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA5); - zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA5); - - zf0 = _mm_mul_pd(zf0, yf0); - zf1 = _mm_mul_pd(zf1, yf1); - - _mm_storeu_pd(y + i, zf0); - _mm_storeu_pd(y + i + 2, zf1); - } - } - else -#endif - for( ; i <= n - 4; i += 4 ) - { - double x0 = x[i].f * exp_prescale; - double x1 = x[i + 1].f * exp_prescale; - double x2 = x[i + 2].f * exp_prescale; - double x3 = x[i + 3].f * exp_prescale; - - double y0, y1, y2, y3; - int val0, val1, val2, val3, t; - - t = (int)(x[i].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x0 = t < 0 ? -exp_max_val : exp_max_val; - - t = (int)(x[i+1].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x1 = t < 0 ? -exp_max_val : exp_max_val; - - t = (int)(x[i+2].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x2 = t < 0 ? -exp_max_val : exp_max_val; - - t = (int)(x[i+3].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x3 = t < 0 ? -exp_max_val : exp_max_val; - - val0 = cvRound(x0); - val1 = cvRound(x1); - val2 = cvRound(x2); - val3 = cvRound(x3); - - x0 = (x0 - val0)*exp_postscale; - x1 = (x1 - val1)*exp_postscale; - x2 = (x2 - val2)*exp_postscale; - x3 = (x3 - val3)*exp_postscale; - - t = (val0 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[0].i = (int64)t << 52; - - t = (val1 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[1].i = (int64)t << 52; - - t = (val2 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[2].i = (int64)t << 52; - - t = (val3 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; - buf[3].i = (int64)t << 52; - - y0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); - y1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); - - y[i] = y0; - y[i + 1] = y1; - - y2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); - y3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); - - y[i + 2] = y2; - y[i + 3] = y3; - } - - for( ; i < n; i++ ) - { - double x0 = x[i].f * exp_prescale; - int val0, t; - - t = (int)(x[i].i >> 52); - if( (t & 2047) > 1023 + 10 ) - x0 = t < 0 ? -exp_max_val : exp_max_val; - - val0 = cvRound(x0); - t = (val0 >> EXPTAB_SCALE) + 1023; - t = !(t & ~2047) ? t : t < 0 ? 
0 : 2047; - - buf[0].i = (int64)t << 52; - x0 = (x0 - val0)*exp_postscale; - - y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); - } -} - -#undef EXPTAB_SCALE -#undef EXPTAB_MASK -#undef EXPPOLY_32F_A0 - #ifdef HAVE_IPP static void Exp_32f_ipp(const float *x, float *y, int n) { @@ -1584,7 +758,7 @@ static void Exp_32f_ipp(const float *x, float *y, int n) } setIppErrorStatus(); } - Exp_32f(x, y, n); + hal::exp(x, y, n); } static void Exp_64f_ipp(const double *x, double *y, int n) @@ -1598,11 +772,14 @@ static void Exp_64f_ipp(const double *x, double *y, int n) } setIppErrorStatus(); } - Exp_64f(x, y, n); + hal::exp(x, y, n); } #define Exp_32f Exp_32f_ipp #define Exp_64f Exp_64f_ipp +#else +#define Exp_32f hal::exp +#define Exp_64f hal::exp #endif @@ -1637,613 +814,6 @@ void exp( InputArray _src, OutputArray _dst ) * L O G * \****************************************************************************************/ -#define LOGTAB_SCALE 8 -#define LOGTAB_MASK ((1 << LOGTAB_SCALE) - 1) -#define LOGTAB_MASK2 ((1 << (20 - LOGTAB_SCALE)) - 1) -#define LOGTAB_MASK2_32F ((1 << (23 - LOGTAB_SCALE)) - 1) - -static const double CV_DECL_ALIGNED(16) icvLogTab[] = { -0.0000000000000000000000000000000000000000, 1.000000000000000000000000000000000000000, -.00389864041565732288852075271279318258166, .9961089494163424124513618677042801556420, -.00778214044205494809292034119607706088573, .9922480620155038759689922480620155038760, -.01165061721997527263705585198749759001657, .9884169884169884169884169884169884169884, -.01550418653596525274396267235488267033361, .9846153846153846153846153846153846153846, -.01934296284313093139406447562578250654042, .9808429118773946360153256704980842911877, -.02316705928153437593630670221500622574241, .9770992366412213740458015267175572519084, -.02697658769820207233514075539915211265906, .9733840304182509505703422053231939163498, -.03077165866675368732785500469617545604706, .9696969696969696969696969696969696969697, -.03455238150665972812758397481047722976656, .9660377358490566037735849056603773584906, -.03831886430213659461285757856785494368522, .9624060150375939849624060150375939849624, -.04207121392068705056921373852674150839447, .9588014981273408239700374531835205992509, -.04580953603129420126371940114040626212953, .9552238805970149253731343283582089552239, -.04953393512227662748292900118940451648088, .9516728624535315985130111524163568773234, -.05324451451881227759255210685296333394944, .9481481481481481481481481481481481481481, -.05694137640013842427411105973078520037234, .9446494464944649446494464944649446494465, -.06062462181643483993820353816772694699466, .9411764705882352941176470588235294117647, -.06429435070539725460836422143984236754475, .9377289377289377289377289377289377289377, -.06795066190850773679699159401934593915938, .9343065693430656934306569343065693430657, -.07159365318700880442825962290953611955044, .9309090909090909090909090909090909090909, -.07522342123758751775142172846244648098944, .9275362318840579710144927536231884057971, -.07884006170777602129362549021607264876369, .9241877256317689530685920577617328519856, -.08244366921107458556772229485432035289706, .9208633093525179856115107913669064748201, -.08603433734180314373940490213499288074675, .9175627240143369175627240143369175627240, -.08961215868968712416897659522874164395031, .9142857142857142857142857142857142857143, -.09317722485418328259854092721070628613231, .9110320284697508896797153024911032028470, -.09672962645855109897752299730200320482256, 
.9078014184397163120567375886524822695035, -.10026945316367513738597949668474029749630, .9045936395759717314487632508833922261484, -.10379679368164355934833764649738441221420, .9014084507042253521126760563380281690141, -.10731173578908805021914218968959175981580, .8982456140350877192982456140350877192982, -.11081436634029011301105782649756292812530, .8951048951048951048951048951048951048951, -.11430477128005862852422325204315711744130, .8919860627177700348432055749128919860627, -.11778303565638344185817487641543266363440, .8888888888888888888888888888888888888889, -.12124924363286967987640707633545389398930, .8858131487889273356401384083044982698962, -.12470347850095722663787967121606925502420, .8827586206896551724137931034482758620690, -.12814582269193003360996385708858724683530, .8797250859106529209621993127147766323024, -.13157635778871926146571524895989568904040, .8767123287671232876712328767123287671233, -.13499516453750481925766280255629681050780, .8737201365187713310580204778156996587031, -.13840232285911913123754857224412262439730, .8707482993197278911564625850340136054422, -.14179791186025733629172407290752744302150, .8677966101694915254237288135593220338983, -.14518200984449788903951628071808954700830, .8648648648648648648648648648648648648649, -.14855469432313711530824207329715136438610, .8619528619528619528619528619528619528620, -.15191604202584196858794030049466527998450, .8590604026845637583892617449664429530201, -.15526612891112392955683674244937719777230, .8561872909698996655518394648829431438127, -.15860503017663857283636730244325008243330, .8533333333333333333333333333333333333333, -.16193282026931324346641360989451641216880, .8504983388704318936877076411960132890365, -.16524957289530714521497145597095368430010, .8476821192052980132450331125827814569536, -.16855536102980664403538924034364754334090, .8448844884488448844884488448844884488449, -.17185025692665920060697715143760433420540, .8421052631578947368421052631578947368421, -.17513433212784912385018287750426679849630, .8393442622950819672131147540983606557377, -.17840765747281828179637841458315961062910, .8366013071895424836601307189542483660131, -.18167030310763465639212199675966985523700, .8338762214983713355048859934853420195440, -.18492233849401198964024217730184318497780, .8311688311688311688311688311688311688312, -.18816383241818296356839823602058459073300, .8284789644012944983818770226537216828479, -.19139485299962943898322009772527962923050, .8258064516129032258064516129032258064516, -.19461546769967164038916962454095482826240, .8231511254019292604501607717041800643087, -.19782574332991986754137769821682013571260, .8205128205128205128205128205128205128205, -.20102574606059073203390141770796617493040, .8178913738019169329073482428115015974441, -.20421554142869088876999228432396193966280, .8152866242038216560509554140127388535032, -.20739519434607056602715147164417430758480, .8126984126984126984126984126984126984127, -.21056476910734961416338251183333341032260, .8101265822784810126582278481012658227848, -.21372432939771812687723695489694364368910, .8075709779179810725552050473186119873817, -.21687393830061435506806333251006435602900, .8050314465408805031446540880503144654088, -.22001365830528207823135744547471404075630, .8025078369905956112852664576802507836991, -.22314355131420973710199007200571941211830, .8000000000000000000000000000000000000000, -.22626367865045338145790765338460914790630, .7975077881619937694704049844236760124611, -.22937410106484582006380890106811420992010, 
.7950310559006211180124223602484472049689, -.23247487874309405442296849741978803649550, .7925696594427244582043343653250773993808, -.23556607131276688371634975283086532726890, .7901234567901234567901234567901234567901, -.23864773785017498464178231643018079921600, .7876923076923076923076923076923076923077, -.24171993688714515924331749374687206000090, .7852760736196319018404907975460122699387, -.24478272641769091566565919038112042471760, .7828746177370030581039755351681957186544, -.24783616390458124145723672882013488560910, .7804878048780487804878048780487804878049, -.25088030628580937353433455427875742316250, .7781155015197568389057750759878419452888, -.25391520998096339667426946107298135757450, .7757575757575757575757575757575757575758, -.25694093089750041913887912414793390780680, .7734138972809667673716012084592145015106, -.25995752443692604627401010475296061486000, .7710843373493975903614457831325301204819, -.26296504550088134477547896494797896593800, .7687687687687687687687687687687687687688, -.26596354849713793599974565040611196309330, .7664670658682634730538922155688622754491, -.26895308734550393836570947314612567424780, .7641791044776119402985074626865671641791, -.27193371548364175804834985683555714786050, .7619047619047619047619047619047619047619, -.27490548587279922676529508862586226314300, .7596439169139465875370919881305637982196, -.27786845100345625159121709657483734190480, .7573964497041420118343195266272189349112, -.28082266290088775395616949026589281857030, .7551622418879056047197640117994100294985, -.28376817313064456316240580235898960381750, .7529411764705882352941176470588235294118, -.28670503280395426282112225635501090437180, .7507331378299120234604105571847507331378, -.28963329258304265634293983566749375313530, .7485380116959064327485380116959064327485, -.29255300268637740579436012922087684273730, .7463556851311953352769679300291545189504, -.29546421289383584252163927885703742504130, .7441860465116279069767441860465116279070, -.29836697255179722709783618483925238251680, .7420289855072463768115942028985507246377, -.30126133057816173455023545102449133992200, .7398843930635838150289017341040462427746, -.30414733546729666446850615102448500692850, .7377521613832853025936599423631123919308, -.30702503529491181888388950937951449304830, .7356321839080459770114942528735632183908, -.30989447772286465854207904158101882785550, .7335243553008595988538681948424068767908, -.31275571000389684739317885942000430077330, .7314285714285714285714285714285714285714, -.31560877898630329552176476681779604405180, .7293447293447293447293447293447293447293, -.31845373111853458869546784626436419785030, .7272727272727272727272727272727272727273, -.32129061245373424782201254856772720813750, .7252124645892351274787535410764872521246, -.32411946865421192853773391107097268104550, .7231638418079096045197740112994350282486, -.32694034499585328257253991068864706903700, .7211267605633802816901408450704225352113, -.32975328637246797969240219572384376078850, .7191011235955056179775280898876404494382, -.33255833730007655635318997155991382896900, .7170868347338935574229691876750700280112, -.33535554192113781191153520921943709254280, .7150837988826815642458100558659217877095, -.33814494400871636381467055798566434532400, .7130919220055710306406685236768802228412, -.34092658697059319283795275623560883104800, .7111111111111111111111111111111111111111, -.34370051385331840121395430287520866841080, .7091412742382271468144044321329639889197, -.34646676734620857063262633346312213689100, 
.7071823204419889502762430939226519337017, -.34922538978528827602332285096053965389730, .7052341597796143250688705234159779614325, -.35197642315717814209818925519357435405250, .7032967032967032967032967032967032967033, -.35471990910292899856770532096561510115850, .7013698630136986301369863013698630136986, -.35745588892180374385176833129662554711100, .6994535519125683060109289617486338797814, -.36018440357500774995358483465679455548530, .6975476839237057220708446866485013623978, -.36290549368936841911903457003063522279280, .6956521739130434782608695652173913043478, -.36561919956096466943762379742111079394830, .6937669376693766937669376693766937669377, -.36832556115870762614150635272380895912650, .6918918918918918918918918918918918918919, -.37102461812787262962487488948681857436900, .6900269541778975741239892183288409703504, -.37371640979358405898480555151763837784530, .6881720430107526881720430107526881720430, -.37640097516425302659470730759494472295050, .6863270777479892761394101876675603217158, -.37907835293496944251145919224654790014030, .6844919786096256684491978609625668449198, -.38174858149084833769393299007788300514230, .6826666666666666666666666666666666666667, -.38441169891033200034513583887019194662580, .6808510638297872340425531914893617021277, -.38706774296844825844488013899535872042180, .6790450928381962864721485411140583554377, -.38971675114002518602873692543653305619950, .6772486772486772486772486772486772486772, -.39235876060286384303665840889152605086580, .6754617414248021108179419525065963060686, -.39499380824086893770896722344332374632350, .6736842105263157894736842105263157894737, -.39762193064713846624158577469643205404280, .6719160104986876640419947506561679790026, -.40024316412701266276741307592601515352730, .6701570680628272251308900523560209424084, -.40285754470108348090917615991202183067800, .6684073107049608355091383812010443864230, -.40546510810816432934799991016916465014230, .6666666666666666666666666666666666666667, -.40806588980822172674223224930756259709600, .6649350649350649350649350649350649350649, -.41065992498526837639616360320360399782650, .6632124352331606217616580310880829015544, -.41324724855021932601317757871584035456180, .6614987080103359173126614987080103359173, -.41582789514371093497757669865677598863850, .6597938144329896907216494845360824742268, -.41840189913888381489925905043492093682300, .6580976863753213367609254498714652956298, -.42096929464412963239894338585145305842150, .6564102564102564102564102564102564102564, -.42353011550580327293502591601281892508280, .6547314578005115089514066496163682864450, -.42608439531090003260516141381231136620050, .6530612244897959183673469387755102040816, -.42863216738969872610098832410585600882780, .6513994910941475826972010178117048346056, -.43117346481837132143866142541810404509300, .6497461928934010152284263959390862944162, -.43370832042155937902094819946796633303180, .6481012658227848101265822784810126582278, -.43623676677491801667585491486534010618930, .6464646464646464646464646464646464646465, -.43875883620762790027214350629947148263450, .6448362720403022670025188916876574307305, -.44127456080487520440058801796112675219780, .6432160804020100502512562814070351758794, -.44378397241030093089975139264424797147500, .6416040100250626566416040100250626566416, -.44628710262841947420398014401143882423650, .6400000000000000000000000000000000000000, -.44878398282700665555822183705458883196130, .6384039900249376558603491271820448877805, -.45127464413945855836729492693848442286250, 
.6368159203980099502487562189054726368159, -.45375911746712049854579618113348260521900, .6352357320099255583126550868486352357320, -.45623743348158757315857769754074979573500, .6336633663366336633663366336633663366337, -.45870962262697662081833982483658473938700, .6320987654320987654320987654320987654321, -.46117571512217014895185229761409573256980, .6305418719211822660098522167487684729064, -.46363574096303250549055974261136725544930, .6289926289926289926289926289926289926290, -.46608972992459918316399125615134835243230, .6274509803921568627450980392156862745098, -.46853771156323925639597405279346276074650, .6259168704156479217603911980440097799511, -.47097971521879100631480241645476780831830, .6243902439024390243902439024390243902439, -.47341577001667212165614273544633761048330, .6228710462287104622871046228710462287105, -.47584590486996386493601107758877333253630, .6213592233009708737864077669902912621359, -.47827014848147025860569669930555392056700, .6198547215496368038740920096852300242131, -.48068852934575190261057286988943815231330, .6183574879227053140096618357487922705314, -.48310107575113581113157579238759353756900, .6168674698795180722891566265060240963855, -.48550781578170076890899053978500887751580, .6153846153846153846153846153846153846154, -.48790877731923892879351001283794175833480, .6139088729016786570743405275779376498801, -.49030398804519381705802061333088204264650, .6124401913875598086124401913875598086124, -.49269347544257524607047571407747454941280, .6109785202863961813842482100238663484487, -.49507726679785146739476431321236304938800, .6095238095238095238095238095238095238095, -.49745538920281889838648226032091770321130, .6080760095011876484560570071258907363420, -.49982786955644931126130359189119189977650, .6066350710900473933649289099526066350711, -.50219473456671548383667413872899487614650, .6052009456264775413711583924349881796690, -.50455601075239520092452494282042607665050, .6037735849056603773584905660377358490566, -.50691172444485432801997148999362252652650, .6023529411764705882352941176470588235294, -.50926190178980790257412536448100581765150, .6009389671361502347417840375586854460094, -.51160656874906207391973111953120678663250, .5995316159250585480093676814988290398126, -.51394575110223428282552049495279788970950, .5981308411214953271028037383177570093458, -.51627947444845445623684554448118433356300, .5967365967365967365967365967365967365967, -.51860776420804555186805373523384332656850, .5953488372093023255813953488372093023256, -.52093064562418522900344441950437612831600, .5939675174013921113689095127610208816705, -.52324814376454775732838697877014055848100, .5925925925925925925925925925925925925926, -.52556028352292727401362526507000438869000, .5912240184757505773672055427251732101617, -.52786708962084227803046587723656557500350, .5898617511520737327188940092165898617512, -.53016858660912158374145519701414741575700, .5885057471264367816091954022988505747126, -.53246479886947173376654518506256863474850, .5871559633027522935779816513761467889908, -.53475575061602764748158733709715306758900, .5858123569794050343249427917620137299771, -.53704146589688361856929077475797384977350, .5844748858447488584474885844748858447489, -.53932196859560876944783558428753167390800, .5831435079726651480637813211845102505695, -.54159728243274429804188230264117009937750, .5818181818181818181818181818181818181818, -.54386743096728351609669971367111429572100, .5804988662131519274376417233560090702948, -.54613243759813556721383065450936555862450, 
.5791855203619909502262443438914027149321, -.54839232556557315767520321969641372561450, .5778781038374717832957110609480812641084, -.55064711795266219063194057525834068655950, .5765765765765765765765765765765765765766, -.55289683768667763352766542084282264113450, .5752808988764044943820224719101123595506, -.55514150754050151093110798683483153581600, .5739910313901345291479820627802690582960, -.55738115013400635344709144192165695130850, .5727069351230425055928411633109619686801, -.55961578793542265941596269840374588966350, .5714285714285714285714285714285714285714, -.56184544326269181269140062795486301183700, .5701559020044543429844097995545657015590, -.56407013828480290218436721261241473257550, .5688888888888888888888888888888888888889, -.56628989502311577464155334382667206227800, .5676274944567627494456762749445676274945, -.56850473535266865532378233183408156037350, .5663716814159292035398230088495575221239, -.57071468100347144680739575051120482385150, .5651214128035320088300220750551876379691, -.57291975356178548306473885531886480748650, .5638766519823788546255506607929515418502, -.57511997447138785144460371157038025558000, .5626373626373626373626373626373626373626, -.57731536503482350219940144597785547375700, .5614035087719298245614035087719298245614, -.57950594641464214795689713355386629700650, .5601750547045951859956236323851203501094, -.58169173963462239562716149521293118596100, .5589519650655021834061135371179039301310, -.58387276558098266665552955601015128195300, .5577342047930283224400871459694989106754, -.58604904500357812846544902640744112432000, .5565217391304347826086956521739130434783, -.58822059851708596855957011939608491957200, .5553145336225596529284164859002169197397, -.59038744660217634674381770309992134571100, .5541125541125541125541125541125541125541, -.59254960960667157898740242671919986605650, .5529157667386609071274298056155507559395, -.59470710774669277576265358220553025603300, .5517241379310344827586206896551724137931, -.59685996110779382384237123915227130055450, .5505376344086021505376344086021505376344, -.59900818964608337768851242799428291618800, .5493562231759656652360515021459227467811, -.60115181318933474940990890900138765573500, .5481798715203426124197002141327623126338, -.60329085143808425240052883964381180703650, .5470085470085470085470085470085470085470, -.60542532396671688843525771517306566238400, .5458422174840085287846481876332622601279, -.60755525022454170969155029524699784815300, .5446808510638297872340425531914893617021, -.60968064953685519036241657886421307921400, .5435244161358811040339702760084925690021, -.61180154110599282990534675263916142284850, .5423728813559322033898305084745762711864, -.61391794401237043121710712512140162289150, .5412262156448202959830866807610993657505, -.61602987721551394351138242200249806046500, .5400843881856540084388185654008438818565, -.61813735955507864705538167982012964785100, .5389473684210526315789473684210526315789, -.62024040975185745772080281312810257077200, .5378151260504201680672268907563025210084, -.62233904640877868441606324267922900617100, .5366876310272536687631027253668763102725, -.62443328801189346144440150965237990021700, .5355648535564853556485355648535564853556, -.62652315293135274476554741340805776417250, .5344467640918580375782881002087682672234, -.62860865942237409420556559780379757285100, .5333333333333333333333333333333333333333, -.63068982562619868570408243613201193511500, .5322245322245322245322245322245322245322, -.63276666957103777644277897707070223987100, 
.5311203319502074688796680497925311203320, -.63483920917301017716738442686619237065300, .5300207039337474120082815734989648033126, -.63690746223706917739093569252872839570050, .5289256198347107438016528925619834710744, -.63897144645792069983514238629140891134750, .5278350515463917525773195876288659793814, -.64103117942093124081992527862894348800200, .5267489711934156378600823045267489711934, -.64308667860302726193566513757104985415950, .5256673511293634496919917864476386036961, -.64513796137358470073053240412264131009600, .5245901639344262295081967213114754098361, -.64718504499530948859131740391603671014300, .5235173824130879345603271983640081799591, -.64922794662510974195157587018911726772800, .5224489795918367346938775510204081632653, -.65126668331495807251485530287027359008800, .5213849287169042769857433808553971486762, -.65330127201274557080523663898929953575150, .5203252032520325203252032520325203252033, -.65533172956312757406749369692988693714150, .5192697768762677484787018255578093306288, -.65735807270835999727154330685152672231200, .5182186234817813765182186234817813765182, -.65938031808912778153342060249997302889800, .5171717171717171717171717171717171717172, -.66139848224536490484126716182800009846700, .5161290322580645161290322580645161290323, -.66341258161706617713093692145776003599150, .5150905432595573440643863179074446680080, -.66542263254509037562201001492212526500250, .5140562248995983935742971887550200803213, -.66742865127195616370414654738851822912700, .5130260521042084168336673346693386773547, -.66943065394262923906154583164607174694550, .5120000000000000000000000000000000000000, -.67142865660530226534774556057527661323550, .5109780439121756487025948103792415169661, -.67342267521216669923234121597488410770900, .5099601593625498007968127490039840637450, -.67541272562017662384192817626171745359900, .5089463220675944333996023856858846918489, -.67739882359180603188519853574689477682100, .5079365079365079365079365079365079365079, -.67938098479579733801614338517538271844400, .5069306930693069306930693069306930693069, -.68135922480790300781450241629499942064300, .5059288537549407114624505928853754940711, -.68333355911162063645036823800182901322850, .5049309664694280078895463510848126232742, -.68530400309891936760919861626462079584600, .5039370078740157480314960629921259842520, -.68727057207096020619019327568821609020250, .5029469548133595284872298624754420432220, -.68923328123880889251040571252815425395950, .5019607843137254901960784313725490196078, -.69314718055994530941723212145818, 5.0e-01, -}; - - - -#define LOGTAB_TRANSLATE(x,h) (((x) - 1.)*icvLogTab[(h)+1]) -static const double ln_2 = 0.69314718055994530941723212145818; - -static void Log_32f( const float *_x, float *y, int n ) -{ - static const float shift[] = { 0, -1.f/512 }; - static const float - A0 = 0.3333333333333333333333333f, - A1 = -0.5f, - A2 = 1.f; - - #undef LOGPOLY - #define LOGPOLY(x) (((A0*(x) + A1)*(x) + A2)*(x)) - - int i = 0; - Cv32suf buf[4]; - const int* x = (const int*)_x; - -#if CV_SSE2 - if( USE_SSE2 ) - { - static const __m128d ln2_2 = _mm_set1_pd(ln_2); - static const __m128 _1_4 = _mm_set1_ps(1.f); - static const __m128 shift4 = _mm_set1_ps(-1.f/512); - - static const __m128 mA0 = _mm_set1_ps(A0); - static const __m128 mA1 = _mm_set1_ps(A1); - static const __m128 mA2 = _mm_set1_ps(A2); - - int CV_DECL_ALIGNED(16) idx[4]; - - for( ; i <= n - 4; i += 4 ) - { - __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); - __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 23), 
_mm_set1_epi32(255)), _mm_set1_epi32(127)); - __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); - __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0,yi0)), ln2_2); - - __m128i xi0 = _mm_or_si128(_mm_and_si128(h0, _mm_set1_epi32(LOGTAB_MASK2_32F)), _mm_set1_epi32(127 << 23)); - - h0 = _mm_and_si128(_mm_srli_epi32(h0, 23 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK*2)); - _mm_store_si128((__m128i*)idx, h0); - h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); - - __m128d t0, t1, t2, t3, t4; - t0 = _mm_load_pd(icvLogTab + idx[0]); - t2 = _mm_load_pd(icvLogTab + idx[1]); - t1 = _mm_unpackhi_pd(t0, t2); - t0 = _mm_unpacklo_pd(t0, t2); - t2 = _mm_load_pd(icvLogTab + idx[2]); - t4 = _mm_load_pd(icvLogTab + idx[3]); - t3 = _mm_unpackhi_pd(t2, t4); - t2 = _mm_unpacklo_pd(t2, t4); - - yd0 = _mm_add_pd(yd0, t0); - yd1 = _mm_add_pd(yd1, t2); - - __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); - - __m128 xf0 = _mm_sub_ps(_mm_castsi128_ps(xi0), _1_4); - xf0 = _mm_mul_ps(xf0, _mm_movelh_ps(_mm_cvtpd_ps(t1), _mm_cvtpd_ps(t3))); - xf0 = _mm_add_ps(xf0, _mm_and_ps(_mm_castsi128_ps(h0), shift4)); - - __m128 zf0 = _mm_mul_ps(xf0, mA0); - zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA1), xf0); - zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA2), xf0); - yf0 = _mm_add_ps(yf0, zf0); - - _mm_storeu_ps(y + i, yf0); - } - } - else -#endif - for( ; i <= n - 4; i += 4 ) - { - double x0, x1, x2, x3; - double y0, y1, y2, y3; - int h0, h1, h2, h3; - - h0 = x[i]; - h1 = x[i+1]; - buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); - buf[1].i = (h1 & LOGTAB_MASK2_32F) | (127 << 23); - - y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; - y1 = (((h1 >> 23) & 0xff) - 127) * ln_2; - - h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h1 = (h1 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - y1 += icvLogTab[h1]; - - h2 = x[i+2]; - h3 = x[i+3]; - - x0 = LOGTAB_TRANSLATE( buf[0].f, h0 ); - x1 = LOGTAB_TRANSLATE( buf[1].f, h1 ); - - buf[2].i = (h2 & LOGTAB_MASK2_32F) | (127 << 23); - buf[3].i = (h3 & LOGTAB_MASK2_32F) | (127 << 23); - - y2 = (((h2 >> 23) & 0xff) - 127) * ln_2; - y3 = (((h3 >> 23) & 0xff) - 127) * ln_2; - - h2 = (h2 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h3 = (h3 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y2 += icvLogTab[h2]; - y3 += icvLogTab[h3]; - - x2 = LOGTAB_TRANSLATE( buf[2].f, h2 ); - x3 = LOGTAB_TRANSLATE( buf[3].f, h3 ); - - x0 += shift[h0 == 510]; - x1 += shift[h1 == 510]; - y0 += LOGPOLY( x0 ); - y1 += LOGPOLY( x1 ); - - y[i] = (float) y0; - y[i + 1] = (float) y1; - - x2 += shift[h2 == 510]; - x3 += shift[h3 == 510]; - y2 += LOGPOLY( x2 ); - y3 += LOGPOLY( x3 ); - - y[i + 2] = (float) y2; - y[i + 3] = (float) y3; - } - - for( ; i < n; i++ ) - { - int h0 = x[i]; - double y0; - float x0; - - y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; - - buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); - h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - x0 = (float)LOGTAB_TRANSLATE( buf[0].f, h0 ); - x0 += shift[h0 == 510]; - y0 += LOGPOLY( x0 ); - - y[i] = (float)y0; - } -} - - -static void Log_64f( const double *x, double *y, int n ) -{ - static const double shift[] = { 0, -1./512 }; - static const double - A7 = 1.0, - A6 = -0.5, - A5 = 0.333333333333333314829616256247390992939472198486328125, - A4 = -0.25, - A3 = 0.2, - A2 = -0.1666666666666666574148081281236954964697360992431640625, - A1 = 0.1428571428571428769682682968777953647077083587646484375, - A0 = -0.125; - - #undef LOGPOLY - #define LOGPOLY(x,k) ((x)+=shift[k], xq = (x)*(x),\ 
- (((A0*xq + A2)*xq + A4)*xq + A6)*xq + \ - (((A1*xq + A3)*xq + A5)*xq + A7)*(x)) - - int i = 0; - DBLINT buf[4]; - DBLINT *X = (DBLINT *) x; - -#if CV_SSE2 - if( USE_SSE2 ) - { - static const __m128d ln2_2 = _mm_set1_pd(ln_2); - static const __m128d _1_2 = _mm_set1_pd(1.); - static const __m128d shift2 = _mm_set1_pd(-1./512); - - static const __m128i log_and_mask2 = _mm_set_epi32(LOGTAB_MASK2, 0xffffffff, LOGTAB_MASK2, 0xffffffff); - static const __m128i log_or_mask2 = _mm_set_epi32(1023 << 20, 0, 1023 << 20, 0); - - static const __m128d mA0 = _mm_set1_pd(A0); - static const __m128d mA1 = _mm_set1_pd(A1); - static const __m128d mA2 = _mm_set1_pd(A2); - static const __m128d mA3 = _mm_set1_pd(A3); - static const __m128d mA4 = _mm_set1_pd(A4); - static const __m128d mA5 = _mm_set1_pd(A5); - static const __m128d mA6 = _mm_set1_pd(A6); - static const __m128d mA7 = _mm_set1_pd(A7); - - int CV_DECL_ALIGNED(16) idx[4]; - - for( ; i <= n - 4; i += 4 ) - { - __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); - __m128i h1 = _mm_loadu_si128((const __m128i*)(x + i + 2)); - - __m128d xd0 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h0, log_and_mask2), log_or_mask2)); - __m128d xd1 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h1, log_and_mask2), log_or_mask2)); - - h0 = _mm_unpackhi_epi32(_mm_unpacklo_epi32(h0, h1), _mm_unpackhi_epi32(h0, h1)); - - __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 20), - _mm_set1_epi32(2047)), _mm_set1_epi32(1023)); - __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); - __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0, yi0)), ln2_2); - - h0 = _mm_and_si128(_mm_srli_epi32(h0, 20 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK * 2)); - _mm_store_si128((__m128i*)idx, h0); - h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); - - __m128d t0, t1, t2, t3, t4; - t0 = _mm_load_pd(icvLogTab + idx[0]); - t2 = _mm_load_pd(icvLogTab + idx[1]); - t1 = _mm_unpackhi_pd(t0, t2); - t0 = _mm_unpacklo_pd(t0, t2); - t2 = _mm_load_pd(icvLogTab + idx[2]); - t4 = _mm_load_pd(icvLogTab + idx[3]); - t3 = _mm_unpackhi_pd(t2, t4); - t2 = _mm_unpacklo_pd(t2, t4); - - yd0 = _mm_add_pd(yd0, t0); - yd1 = _mm_add_pd(yd1, t2); - - xd0 = _mm_mul_pd(_mm_sub_pd(xd0, _1_2), t1); - xd1 = _mm_mul_pd(_mm_sub_pd(xd1, _1_2), t3); - - xd0 = _mm_add_pd(xd0, _mm_and_pd(_mm_castsi128_pd(_mm_unpacklo_epi32(h0, h0)), shift2)); - xd1 = _mm_add_pd(xd1, _mm_and_pd(_mm_castsi128_pd(_mm_unpackhi_epi32(h0, h0)), shift2)); - - __m128d zd0 = _mm_mul_pd(xd0, mA0); - __m128d zd1 = _mm_mul_pd(xd1, mA0); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA1), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA1), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA2), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA2), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA3), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA3), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA4), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA4), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA5), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA5), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA6), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA6), xd1); - zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA7), xd0); - zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA7), xd1); - - yd0 = _mm_add_pd(yd0, zd0); - yd1 = _mm_add_pd(yd1, zd1); - - _mm_storeu_pd(y + i, yd0); - _mm_storeu_pd(y + i + 2, yd1); - } - } - else -#endif - for( ; i <= n - 4; i += 4 ) - { - double xq; - double x0, x1, x2, x3; - double y0, y1, y2, y3; - int h0, h1, h2, h3; - - h0 = X[i].i.lo; - h1 = X[i + 1].i.lo; - buf[0].i.lo = h0; - buf[1].i.lo = h1; - - 
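/* Same reduction as the float path, done on the split hi/lo words of each
   double: for x = 2^e * m with m in [1, 2),

       log(x) = e*ln(2) + log(t_k) + log(m/t_k)

   where the pair index h is twice the top LOGTAB_SCALE mantissa bits (the
   table stores two doubles per entry) and icvLogTab holds { log(t_k), 1/t_k },
   so LOGTAB_TRANSLATE yields the small argument (m - t_k)/t_k for the
   degree-7 LOGPOLY above. A minimal scalar sketch reusing the constants and
   macros defined above (log_ref is an illustrative name; the h == 510
   wrap-around fixup is applied via LOGPOLY's second argument):

       static double log_ref(double x)
       {
           DBLINT u; u.d = x;
           int h = u.i.hi;
           double xq, y = (((h >> 20) & 0x7ff) - 1023)*ln_2;      // e*ln(2)
           u.i.hi = (h & LOGTAB_MASK2) | (1023 << 20);            // residual m
           h = (h >> (20 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2);  // pair index
           y += icvLogTab[h];                                     // + log(t_k)
           double t = LOGTAB_TRANSLATE(u.d, h);
           y += LOGPOLY(t, h == 510);
           return y;
       }
*/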
h0 = X[i].i.hi; - h1 = X[i + 1].i.hi; - buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); - buf[1].i.hi = (h1 & LOGTAB_MASK2) | (1023 << 20); - - y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; - y1 = (((h1 >> 20) & 0x7ff) - 1023) * ln_2; - - h2 = X[i + 2].i.lo; - h3 = X[i + 3].i.lo; - buf[2].i.lo = h2; - buf[3].i.lo = h3; - - h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h1 = (h1 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - y1 += icvLogTab[h1]; - - h2 = X[i + 2].i.hi; - h3 = X[i + 3].i.hi; - - x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); - x1 = LOGTAB_TRANSLATE( buf[1].d, h1 ); - - buf[2].i.hi = (h2 & LOGTAB_MASK2) | (1023 << 20); - buf[3].i.hi = (h3 & LOGTAB_MASK2) | (1023 << 20); - - y2 = (((h2 >> 20) & 0x7ff) - 1023) * ln_2; - y3 = (((h3 >> 20) & 0x7ff) - 1023) * ln_2; - - h2 = (h2 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - h3 = (h3 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y2 += icvLogTab[h2]; - y3 += icvLogTab[h3]; - - x2 = LOGTAB_TRANSLATE( buf[2].d, h2 ); - x3 = LOGTAB_TRANSLATE( buf[3].d, h3 ); - - y0 += LOGPOLY( x0, h0 == 510 ); - y1 += LOGPOLY( x1, h1 == 510 ); - - y[i] = y0; - y[i + 1] = y1; - - y2 += LOGPOLY( x2, h2 == 510 ); - y3 += LOGPOLY( x3, h3 == 510 ); - - y[i + 2] = y2; - y[i + 3] = y3; - } - - for( ; i < n; i++ ) - { - int h0 = X[i].i.hi; - double xq; - double x0, y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; - - buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); - buf[0].i.lo = X[i].i.lo; - h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; - - y0 += icvLogTab[h0]; - x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); - y0 += LOGPOLY( x0, h0 == 510 ); - y[i] = y0; - } -} - #ifdef HAVE_IPP static void Log_32f_ipp(const float *x, float *y, int n) { @@ -2256,7 +826,7 @@ static void Log_32f_ipp(const float *x, float *y, int n) } setIppErrorStatus(); } - Log_32f(x, y, n); + hal::log(x, y, n); } static void Log_64f_ipp(const double *x, double *y, int n) @@ -2270,11 +840,14 @@ static void Log_64f_ipp(const double *x, double *y, int n) } setIppErrorStatus(); } - Log_64f(x, y, n); + hal::log(x, y, n); } #define Log_32f Log_32f_ipp #define Log_64f Log_64f_ipp +#else +#define Log_32f hal::log +#define Log_64f hal::log #endif void log( InputArray _src, OutputArray _dst ) @@ -2651,6 +1224,11 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst, #endif +static void InvSqrt_32f(const float* src, float* dst, int n) { hal::invSqrt(src, dst, n); } +static void InvSqrt_64f(const double* src, double* dst, int n) { hal::invSqrt(src, dst, n); } +static void Sqrt_32f(const float* src, float* dst, int n) { hal::sqrt(src, dst, n); } +static void Sqrt_64f(const double* src, double* dst, int n) { hal::sqrt(src, dst, n); } + void pow( InputArray _src, double power, OutputArray _dst ) { int type = _src.type(), depth = CV_MAT_DEPTH(type), @@ -3085,27 +1663,6 @@ void patchNaNs( InputOutputArray _a, double _val ) } } - -void exp(const float* src, float* dst, int n) -{ - Exp_32f(src, dst, n); -} - -void log(const float* src, float* dst, int n) -{ - Log_32f(src, dst, n); -} - -void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees) -{ - FastAtan2_32f(y, x, dst, n, angleInDegrees); -} - -void magnitude(const float* x, const float* y, float* dst, int n) -{ - Magnitude_32f(x, y, dst, n); -} - } CV_IMPL float cvCbrt(float value) { return cv::cubeRoot(value); } diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index ca707e78a..e43df9444 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp 
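
The mathfuncs.cpp hunks above fold the open-coded Log_32f/Log_64f kernels into the new cv::hal::log entry points; the IPP wrappers now fall back to the HAL rather than to local copies. A minimal caller-side sketch of the consolidated API, assuming contiguous buffers (function and buffer names here are illustrative, not from the patch):

#include <opencv2/hal.hpp>

// Hedged sketch: exercises the existing 32f path and the 64f overload
// that this patch adds to hal.hpp. Names are ours.
void log_all(const float* s32, float* d32,
             const double* s64, double* d64, int n)
{
    cv::hal::log(s32, d32, n);  // existing float overload
    cv::hal::log(s64, d64, n);  // double overload introduced by this series
}
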
@@ -2416,140 +2416,6 @@ void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal, namespace cv { -float normL2Sqr_(const float* a, const float* b, int n) -{ - int j = 0; float d = 0.f; -#if CV_SSE - if( USE_SSE2 ) - { - float CV_DECL_ALIGNED(16) buf[4]; - __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); - - for( ; j <= n - 8; j += 8 ) - { - __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j)); - __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4)); - d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0)); - d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1)); - } - _mm_store_ps(buf, _mm_add_ps(d0, d1)); - d = buf[0] + buf[1] + buf[2] + buf[3]; - } - else -#endif - { - for( ; j <= n - 4; j += 4 ) - { - float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3]; - d += t0*t0 + t1*t1 + t2*t2 + t3*t3; - } - } - - for( ; j < n; j++ ) - { - float t = a[j] - b[j]; - d += t*t; - } - return d; -} - - -float normL1_(const float* a, const float* b, int n) -{ - int j = 0; float d = 0.f; -#if CV_SSE - if( USE_SSE2 ) - { - float CV_DECL_ALIGNED(16) buf[4]; - static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; - __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); - __m128 absmask = _mm_load_ps((const float*)absbuf); - - for( ; j <= n - 8; j += 8 ) - { - __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j)); - __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4)); - d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask)); - d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask)); - } - _mm_store_ps(buf, _mm_add_ps(d0, d1)); - d = buf[0] + buf[1] + buf[2] + buf[3]; - } - else -#elif CV_NEON - float32x4_t v_sum = vdupq_n_f32(0.0f); - for ( ; j <= n - 4; j += 4) - v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j))); - - float CV_DECL_ALIGNED(16) buf[4]; - vst1q_f32(buf, v_sum); - d = buf[0] + buf[1] + buf[2] + buf[3]; -#endif - { - for( ; j <= n - 4; j += 4 ) - { - d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) + - std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]); - } - } - - for( ; j < n; j++ ) - d += std::abs(a[j] - b[j]); - return d; -} - -int normL1_(const uchar* a, const uchar* b, int n) -{ - int j = 0, d = 0; -#if CV_SSE - if( USE_SSE2 ) - { - __m128i d0 = _mm_setzero_si128(); - - for( ; j <= n - 16; j += 16 ) - { - __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j)); - __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j)); - - d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); - } - - for( ; j <= n - 4; j += 4 ) - { - __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j)); - __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j)); - - d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); - } - d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0))); - } - else -#elif CV_NEON - uint32x4_t v_sum = vdupq_n_u32(0.0f); - for ( ; j <= n - 16; j += 16) - { - uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j)); - uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst)); - v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high))); - v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high))); - } - - uint CV_DECL_ALIGNED(16) buf[4]; - vst1q_u32(buf, v_sum); - d = buf[0] + buf[1] + buf[2] + buf[3]; -#endif - { - for( ; j <= n - 4; j += 4 ) - { - d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) + - std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]); - } - } - for( ; j < n; j++ ) - d += 
std::abs(a[j] - b[j]); - return d; -} - template int normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn) { @@ -2564,7 +2430,7 @@ normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn) if( mask[i] ) { for( int k = 0; k < cn; k++ ) - result = std::max(result, ST(std::abs(src[k]))); + result = std::max(result, ST(cv_abs(src[k]))); } } *_result = result; @@ -2585,7 +2451,7 @@ normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn) if( mask[i] ) { for( int k = 0; k < cn; k++ ) - result += std::abs(src[k]); + result += cv_abs(src[k]); } } *_result = result; @@ -2684,9 +2550,7 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const { - int result = 0; - cv::hal::normHamming(a, b, size, result); - return result; + return cv::hal::normHamming(a, b, size); } #define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \ @@ -3037,16 +2901,12 @@ double cv::norm( InputArray _src, int normType, InputArray _mask ) if( normType == NORM_HAMMING ) { - int result = 0; - cv::hal::normHamming(data, (int)len, result); - return result; + return hal::normHamming(data, (int)len); } if( normType == NORM_HAMMING2 ) { - int result = 0; - hal::normHamming(data, (int)len, 2, result); - return result; + return hal::normHamming(data, (int)len, 2); } } } @@ -3072,9 +2932,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask ) for( size_t i = 0; i < it.nplanes; i++, ++it ) { - int one = 0; - cv::hal::normHamming(ptrs[0], total, cellSize, one); - result += one; + result += hal::normHamming(ptrs[0], total, cellSize); } return result; @@ -3558,9 +3416,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m for( size_t i = 0; i < it.nplanes; i++, ++it ) { - int one = 0; - hal::normHamming(ptrs[0], ptrs[1], total, cellSize, one); - result += one; + result += hal::normHamming(ptrs[0], ptrs[1], total, cellSize); } return result; @@ -3698,7 +3554,7 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2, if( !mask ) { for( int i = 0; i < nvecs; i++ ) - hal::normHamming(src1, src2 + step2*i, len, dist[i]); + dist[i] = hal::normHamming(src1, src2 + step2*i, len); } else { @@ -3706,7 +3562,7 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2, for( int i = 0; i < nvecs; i++ ) { if (mask[i]) - hal::normHamming(src1, src2 + step2*i, len, dist[i]); + dist[i] = hal::normHamming(src1, src2 + step2*i, len); else dist[i] = val0; } @@ -3720,7 +3576,7 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2 if( !mask ) { for( int i = 0; i < nvecs; i++ ) - hal::normHamming(src1, src2 + step2*i, len, 2, dist[i]); + dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2); } else { @@ -3728,7 +3584,7 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2 for( int i = 0; i < nvecs; i++ ) { if (mask[i]) - hal::normHamming(src1, src2 + step2*i, len, 2, dist[i]); + dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2); else dist[i] = val0; } diff --git a/modules/features2d/src/kaze/AKAZEFeatures.cpp b/modules/features2d/src/kaze/AKAZEFeatures.cpp index fd15345b2..d12656e99 100644 --- a/modules/features2d/src/kaze/AKAZEFeatures.cpp +++ b/modules/features2d/src/kaze/AKAZEFeatures.cpp @@ -812,7 +812,7 @@ void AKAZEFeatures::Compute_Main_Orientation(KeyPoint& kpt, const std::vector(float)(2.0*CV_PI) ? 
ang1 - (float)(5.0*CV_PI / 3.0) : ang1 + (float)(CV_PI / 3.0));
diff --git a/modules/hal/include/opencv2/hal.hpp b/modules/hal/include/opencv2/hal.hpp
index 7641c46ca..95d1ac66c 100644
--- a/modules/hal/include/opencv2/hal.hpp
+++ b/modules/hal/include/opencv2/hal.hpp
@@ -81,28 +81,17 @@ float normL1_(const float* a, const float* b, int n);
 float normL2Sqr_(const float* a, const float* b, int n);
 
 void exp(const float* src, float* dst, int n);
+void exp(const double* src, double* dst, int n);
 void log(const float* src, float* dst, int n);
+void log(const double* src, double* dst, int n);
 
 void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
 void magnitude(const float* x, const float* y, float* dst, int n);
-
-/** @brief Computes the cube root of an argument.
-
- The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
- NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
- single-precision data.
- @param val A function argument.
- */
-float cubeRoot(float val);
-
-/** @brief Calculates the angle of a 2D vector in degrees.
-
- The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
- in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
- @param x x-coordinate of the vector.
- @param y y-coordinate of the vector.
- */
-float fastAtan2(float y, float x);
+void magnitude(const double* x, const double* y, double* dst, int n);
+void sqrt(const float* src, float* dst, int len);
+void sqrt(const double* src, double* dst, int len);
+void invSqrt(const float* src, float* dst, int len);
+void invSqrt(const double* src, double* dst, int len);
 
 }} //cv::hal
diff --git a/modules/hal/include/opencv2/hal/defs.h b/modules/hal/include/opencv2/hal/defs.h
index 6e1ff2a0a..c011fe617 100644
--- a/modules/hal/include/opencv2/hal/defs.h
+++ b/modules/hal/include/opencv2/hal/defs.h
@@ -380,7 +380,7 @@ cvRound( double value )
     TEGRA_ROUND_DBL(value);
 #elif defined CV_ICC || defined __GNUC__
 # if CV_VFP
-    ARM_ROUND_DBL(value)
+    ARM_ROUND_DBL(value);
 # else
     return (int)lrint(value);
 # endif
@@ -488,7 +488,7 @@ CV_INLINE int cvRound(float value)
     TEGRA_ROUND_FLT(value);
 #elif defined CV_ICC || defined __GNUC__
 # if CV_VFP
-    ARM_ROUND_FLT(value)
+    ARM_ROUND_FLT(value);
 # else
     return (int)lrintf(value);
 # endif
diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp
index b7b147a19..c3c47e059 100644
--- a/modules/hal/include/opencv2/hal/intrin.hpp
+++ b/modules/hal/include/opencv2/hal/intrin.hpp
@@ -438,6 +438,14 @@ OPENCV_HAL_IMPL_ADDSUB_OP(v_add_wrap, +, (_Tp), _Tp)
 OPENCV_HAL_IMPL_ADDSUB_OP(v_sub_wrap, -, (_Tp), _Tp)
 OPENCV_HAL_IMPL_ADDSUB_OP(v_absdiff, -, (rtype)std::abs, typename TypeTraits<_Tp>::abs_type)
 
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
+{
+    v_reg<_Tp, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = 1.f/std::sqrt(a.s[i]);
+    return c;
+}
+
 template<typename _Tp, int n> inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
 {
     v_reg<_Tp, n> c;
@@ -446,6 +454,7 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp,
     return c;
 }
 
+
 template<typename _Tp, int n> inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
 {
     v_reg<_Tp, n> c;
@@ -544,7 +553,7 @@ template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
 }
 
 template<typename _Tp, int n> inline v_reg<typename TypeTraits<_Tp>::int_type, n>
-    v_reinterpret_int(const v_reg<_Tp, n>& a)
+    v_reinterpret_as_int(const v_reg<_Tp, n>& a)
 {
v_reg::int_type, n> c; for( int i = 0; i < n; i++ ) @@ -553,7 +562,7 @@ template inline v_reg::int_type, n } template inline v_reg::uint_type, n> - v_reinterpret_uint(const v_reg<_Tp, n>& a) + v_reinterpret_as_uint(const v_reg<_Tp, n>& a) { v_reg::uint_type, n> c; for( int i = 0; i < n; i++ ) @@ -944,7 +953,7 @@ struct v_uint16x8 { return (ushort)_mm_extract_epi16(val, i); } - uchar get0() const + ushort get0() const { return (ushort)_mm_cvtsi128_si32(val); } @@ -1069,79 +1078,79 @@ inline v_int32x4 v_setall_s32(int v) { return v_int32x4(_mm_set1_epi32(v)); } inline v_float32x4 v_setall_f32(float v) { return v_float32x4(_mm_set1_ps(v)); } inline v_float64x2 v_setall_f64(double v) { return v_float64x2(_mm_set1_pd(v)); } -template inline v_uint8x16 v_reinterpret_u8(const _Tpvec& a) +template inline v_uint8x16 v_reinterpret_as_u8(const _Tpvec& a) { return v_uint8x16(a.val); } -inline v_uint8x16 v_reinterpret_u8(const v_float32x4& a) +inline v_uint8x16 v_reinterpret_as_u8(const v_float32x4& a) { return v_uint8x16(_mm_castps_si128(a.val)); } -inline v_uint8x16 v_reinterpret_u8(const v_float64x2& a) +inline v_uint8x16 v_reinterpret_as_u8(const v_float64x2& a) { return v_uint8x16(_mm_castpd_si128(a.val)); } -template inline v_int8x16 v_reinterpret_s8(const _Tpvec& a) +template inline v_int8x16 v_reinterpret_as_s8(const _Tpvec& a) { return v_int8x16(a.val); } -inline v_int8x16 v_reinterpret_s8(const v_float32x4& a) +inline v_int8x16 v_reinterpret_as_s8(const v_float32x4& a) { return v_int8x16(_mm_castps_si128(a.val)); } -inline v_int8x16 v_reinterpret_s8(const v_float64x2& a) +inline v_int8x16 v_reinterpret_as_s8(const v_float64x2& a) { return v_int8x16(_mm_castpd_si128(a.val)); } -template inline v_uint16x8 v_reinterpret_u16(const _Tpvec& a) +template inline v_uint16x8 v_reinterpret_as_u16(const _Tpvec& a) { return v_uint16x8(a.val); } -inline v_uint16x8 v_reinterpret_u16(const v_float32x4& a) +inline v_uint16x8 v_reinterpret_as_u16(const v_float32x4& a) { return v_uint16x8(_mm_castps_si128(a.val)); } -inline v_uint16x8 v_reinterpret_u16(const v_float64x2& a) +inline v_uint16x8 v_reinterpret_as_u16(const v_float64x2& a) { return v_uint16x8(_mm_castpd_si128(a.val)); } -template inline v_int16x8 v_reinterpret_s16(const _Tpvec& a) +template inline v_int16x8 v_reinterpret_as_s16(const _Tpvec& a) { return v_int16x8(a.val); } -inline v_int16x8 v_reinterpret_s16(const v_float32x4& a) +inline v_int16x8 v_reinterpret_as_s16(const v_float32x4& a) { return v_int16x8(_mm_castps_si128(a.val)); } -inline v_int16x8 v_reinterpret_s16(const v_float64x2& a) +inline v_int16x8 v_reinterpret_as_s16(const v_float64x2& a) { return v_int16x8(_mm_castpd_si128(a.val)); } -template inline v_uint32x4 v_reinterpret_u32(const _Tpvec& a) +template inline v_uint32x4 v_reinterpret_as_u32(const _Tpvec& a) { return v_uint32x4(a.val); } -inline v_uint32x4 v_reinterpret_u32(const v_float32x4& a) +inline v_uint32x4 v_reinterpret_as_u32(const v_float32x4& a) { return v_uint32x4(_mm_castps_si128(a.val)); } -inline v_uint32x4 v_reinterpret_u32(const v_float64x2& a) +inline v_uint32x4 v_reinterpret_as_u32(const v_float64x2& a) { return v_uint32x4(_mm_castpd_si128(a.val)); } -template inline v_int32x4 v_reinterpret_s32(const _Tpvec& a) +template inline v_int32x4 v_reinterpret_as_s32(const _Tpvec& a) { return v_int32x4(a.val); } -inline v_int32x4 v_reinterpret_s32(const v_float32x4& a) +inline v_int32x4 v_reinterpret_as_s32(const v_float32x4& a) { return v_int32x4(_mm_castps_si128(a.val)); } -inline v_int32x4 v_reinterpret_s32(const 
v_float64x2& a) +inline v_int32x4 v_reinterpret_as_s32(const v_float64x2& a) { return v_int32x4(_mm_castpd_si128(a.val)); } -template inline v_float32x4 v_reinterpret_f32(const _Tpvec& a) +template inline v_float32x4 v_reinterpret_as_f32(const _Tpvec& a) { return v_float32x4(_mm_castsi128_ps(a.val)); } -inline v_float32x4 v_reinterpret_f32(const v_float64x2& a) +inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) { return v_float32x4(_mm_castpd_ps(a.val)); } -template inline v_float64x2 v_reinterpret_f64(const _Tpvec& a) +template inline v_float64x2 v_reinterpret_as_f64(const _Tpvec& a) { return v_float64x2(_mm_castsi128_pd(a.val)); } -inline v_float64x2 v_reinterpret_f64(const v_float64x2& a) +inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) { return v_float64x2(_mm_castps_pd(a.val)); } -inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b) +inline v_uint8x16 v_cvtn_u16(const v_uint16x8& a, const v_uint16x8& b) { __m128i delta = _mm_set1_epi16(255); return v_uint8x16(_mm_packus_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), _mm_adds_epu16(_mm_subs_epu16(b.val, delta), delta))); } -inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b, int n) +inline v_uint8x16 v_shiftn_u16(const v_uint16x8& a, const v_uint16x8& b, int n) { // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); @@ -1149,81 +1158,53 @@ inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b, int n) _mm_srli_epi16(_mm_add_epi16(b.val, delta), n))); } -inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b) +inline v_uint8x16 v_cvtun_s16(const v_int16x8& a, const v_int16x8& b) { return v_uint8x16(_mm_packus_epi16(a.val, b.val)); } -inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b, int n) +inline v_uint8x16 v_shiftun_s16(const v_int16x8& a, const v_int16x8& b, int n) { __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), _mm_srai_epi16(_mm_add_epi16(b.val, delta), n))); } -inline void v_storesat_u8(uchar* ptr, const v_uint16x8& a) +inline void v_storen_u16(uchar* ptr, const v_uint16x8& a) { __m128i delta = _mm_set1_epi16(255); _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), delta)); } -inline void v_storesat_u8(uchar* ptr, const v_uint16x8& a, int n) +inline void v_shiftstoren_u16(uchar* ptr, const v_uint16x8& a, int n) { __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n), delta)); } -inline void v_storesat_u8(uchar* ptr, const v_int16x8& a) +inline void v_storeun_s16(uchar* ptr, const v_int16x8& a) { _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); } -inline void v_storesat_u8(uchar* ptr, const v_int16x8& a, int n) +inline void v_shiftstoreun_s16(uchar* ptr, const v_int16x8& a, int n) { __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), delta)); } -inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b) -{ - __m128i delta = _mm_set1_epi16(127); - return v_int8x16(_mm_packs_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), - _mm_adds_epu16(_mm_subs_epu16(b.val, delta), delta))); -} - -inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b, int n) -{ - // we 
assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. - __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); - return v_int8x16(_mm_packs_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n), - _mm_srli_epi16(_mm_add_epi16(b.val, delta), n))); -} - -inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b) +inline v_int8x16 v_cvtn_s16(const v_int16x8& a, const v_int16x8& b) { return v_int8x16(_mm_packs_epi16(a.val, b.val)); } -inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b, int n) +inline v_int8x16 v_shiftn_s16(const v_int16x8& a, const v_int16x8& b, int n) { __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_add_epi16(a.val, delta), n), _mm_srai_epi16(_mm_add_epi16(b.val, delta), n))); } -inline void v_storesat_s8(schar* ptr, const v_uint16x8& a) -{ - __m128i delta = _mm_set1_epi16(127); - _mm_storel_epi64((__m128i*)ptr, - _mm_packs_epi16(_mm_adds_epu16(_mm_subs_epu16(a.val, delta), delta), delta)); -} - -inline void v_storesat_s8(schar* ptr, const v_uint16x8& a, int n) -{ - __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); - _mm_storel_epi64((__m128i*)ptr, - _mm_packs_epi16(_mm_srli_epi16(_mm_add_epi16(a.val, delta), n), delta)); -} -inline void v_storesat_s8(schar* ptr, const v_int16x8& a) +inline void v_storen_s16(schar* ptr, const v_int16x8& a) { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); } -inline void v_storesat_s8(schar* ptr, const v_int16x8& a, int n) +inline void v_shiftstoren_s16(schar* ptr, const v_int16x8& a, int n) { __m128i delta = _mm_set1_epi16((short)(1 << (n-1))); _mm_storel_epi64((__m128i*)ptr, @@ -1236,7 +1217,7 @@ inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b) return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask)); } -inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b) +inline v_uint16x8 v_cvtn_u32(const v_uint32x4& a, const v_uint32x4& b) { __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); @@ -1244,20 +1225,20 @@ inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b) __m128i r = _mm_packs_epi32(a1, b1); return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); } -inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b, int n) +inline v_uint16x8 v_shiftn_u32(const v_uint32x4& a, const v_uint32x4& b, int n) { __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32); return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768))); } -inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b) +inline v_uint16x8 v_cvtun_s32(const v_int32x4& a, const v_int32x4& b) { __m128i delta32 = _mm_set1_epi32(32768); __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32)); return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768))); } -inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b, int n) +inline v_uint16x8 v_shiftun_s32(const v_int32x4& a, const v_int32x4& b, int n) { __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); @@ -1265,28 +1246,28 @@ inline 
v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b, int n) return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768))); } -inline void v_storesat_u16(ushort* ptr, const v_uint32x4& a) +inline void v_storen_u32(ushort* ptr, const v_uint32x4& a) { __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32); __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768)); _mm_storel_epi64((__m128i*)ptr, r); } -inline void v_storesat_u16(ushort* ptr, const v_uint32x4& a, int n) +inline void v_shiftstoren_u32(ushort* ptr, const v_uint32x4& a, int n) { __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32); __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768)); _mm_storel_epi64((__m128i*)ptr, r); } -inline void v_storesat_u16(ushort* ptr, const v_int32x4& a) +inline void v_storeun_s32(ushort* ptr, const v_int32x4& a) { __m128i delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(a.val, delta32); __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, delta32), _mm_set1_epi16(-32768)); _mm_storel_epi64((__m128i*)ptr, r); } -inline void v_storesat_u16(ushort* ptr, const v_int32x4& a, int n) +inline void v_shiftstoreun_s32(ushort* ptr, const v_int32x4& a, int n) { __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768); __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32); @@ -1294,45 +1275,20 @@ inline void v_storesat_u16(ushort* ptr, const v_int32x4& a, int n) _mm_storel_epi64((__m128i*)ptr, r); } -inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b) -{ - __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(32767); - __m128i a1 = v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val); - __m128i b1 = v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val); - return v_int16x8(_mm_packs_epi32(a1, b1)); -} -inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b, int n) -{ - __m128i delta = _mm_set1_epi32(1 << (n-1)); - return v_int16x8(_mm_packs_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), - _mm_srli_epi32(_mm_add_epi32(b.val, delta), n))); -} -inline v_int16x8 v_sat_s16(const v_int32x4& a, const v_int32x4& b) +inline v_int16x8 v_cvtn_s32(const v_int32x4& a, const v_int32x4& b) { return v_int16x8(_mm_packs_epi32(a.val, b.val)); } -inline v_int16x8 v_sat_s16(const v_int32x4& a, const v_int32x4& b, int n) +inline v_int16x8 v_shiftn_s32(const v_int32x4& a, const v_int32x4& b, int n) { __m128i delta = _mm_set1_epi32(1 << (n-1)); return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), _mm_srai_epi32(_mm_add_epi32(b.val, delta), n))); } -inline void v_storesat_s16(short* ptr, const v_uint32x4& a) -{ - __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(32767); - __m128i a1 = v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val); - _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1)); -} -inline void v_storesat_s16(short* ptr, const v_uint32x4& a, int n) -{ - __m128i delta = _mm_set1_epi32(1 << (n-1)); - __m128i a1 = _mm_srli_epi32(_mm_add_epi32(a.val, delta), n); - _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1)); -} -inline void v_storesat_s16(short* ptr, const v_int32x4& a) +inline void v_storen_s32(short* ptr, const v_int32x4& a) { 
     _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
 }
-inline void v_storesat_s16(short* ptr, const v_int32x4& a, int n)
+inline void v_shiftstoren_s32(short* ptr, const v_int32x4& a, int n)
 {
     __m128i delta = _mm_set1_epi32(1 << (n-1));
     __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
@@ -1373,6 +1329,8 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16)
 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
@@ -1421,9 +1379,26 @@ OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1
 
 inline v_float32x4 v_sqrt(v_float32x4 x)
 { return v_float32x4(_mm_sqrt_ps(x.val)); }
+
+inline v_float32x4 v_invsqrt(v_float32x4 x)
+{
+    static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
+    __m128 t = x.val;
+    __m128 h = _mm_mul_ps(t, _0_5);
+    t = _mm_rsqrt_ps(t);
+    t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
+    return v_float32x4(t);
+}
+
 inline v_float64x2 v_sqrt(v_float64x2 x)
 { return v_float64x2(_mm_sqrt_pd(x.val)); }
 
+inline v_float64x2 v_invsqrt(v_float64x2 x)
+{
+    static const __m128d v_1 = _mm_set1_pd(1.);
+    return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
+}
+
 inline v_float32x4 v_abs(v_float32x4 x)
 { return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
 inline v_float64x2 v_abs(v_float64x2 x)
@@ -1893,7 +1868,7 @@ inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
     __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
     __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
     __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
-    __m128i t3 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
+    __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
 \
     b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
     b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
@@ -2074,42 +2049,682 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
 
 struct v_uint8x16
 {
+    explicit v_uint8x16(uint8x16_t v) : val(v) {}
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = vld1q_u8(v);
+    }
+    uchar get0() const
+    {
+        return vgetq_lane_u8(val, 0);
+    }
+    uint8x16_t val;
 };
 
 struct v_int8x16
 {
+    explicit v_int8x16(int8x16_t v) : val(v) {}
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = vld1q_s8(v);
+    }
+    schar get0() const
+    {
+        return vgetq_lane_s8(val, 0);
+    }
+    int8x16_t val;
 };
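
Each NEON wrapper struct here couples one quad register with an element-wise constructor (vld1q from a stack array) and a get0() accessor for lane 0. A rough usage sketch, compilable only on a NEON target (variable names are ours, and it assumes the surrounding intrin.hpp namespace):

// a + b saturates per lane via vqaddq_s8 (see the NEON BIN_OP table below).
v_int8x16 a(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
v_int8x16 b = a + a;
schar lane0 = b.get0();   // reads lane 0 of the doubled vector -> 0
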
 struct v_uint16x8
 {
+    explicit v_uint16x8(uint16x8_t v) : val(v) {}
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = vld1q_u16(v);
+    }
+    ushort get0() const
+    {
+        return vgetq_lane_u16(val, 0);
+    }
+    uint16x8_t val;
 };
 
 struct v_int16x8
 {
+    explicit v_int16x8(int16x8_t v) : val(v) {}
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = vld1q_s16(v);
+    }
+    short get0() const
+    {
+        return vgetq_lane_s16(val, 0);
+    }
+    int16x8_t val;
 };
 
 struct v_uint32x4
 {
+    explicit v_uint32x4(uint32x4_t v) : val(v) {}
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {
+        unsigned v[] = {v0, v1, v2, v3};
+        val = vld1q_u32(v);
+    }
+    unsigned get0() const
+    {
+        return vgetq_lane_u32(val, 0);
+    }
+    uint32x4_t val;
 };
 
 struct v_int32x4
 {
+    explicit v_int32x4(int32x4_t v) : val(v) {}
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        int v[] = {v0, v1, v2, v3};
+        val = vld1q_s32(v);
+    }
+    int get0() const
+    {
+        return vgetq_lane_s32(val, 0);
+    }
     int32x4_t val;
 };
 
 struct v_float32x4
 {
+    explicit v_float32x4(float32x4_t v) : val(v) {}
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        float v[] = {v0, v1, v2, v3};
+        val = vld1q_f32(v);
+    }
+    float get0() const
+    {
+        return vgetq_lane_f32(val, 0);
+    }
     float32x4_t val;
 };
 
 typedef v_reg<double, 2> v_float64x2;
 typedef v_reg<double, 4> v_float64x4;
 
+#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
+inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
+inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
+inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
+inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
+inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
+inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
+inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
+inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
+inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
+inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
+
+OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
+OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
+OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
+OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
+
+inline v_uint8x16 v_cvtn_u16(const v_uint16x8& a, const v_uint16x8& b)
+{
+    uint8x8_t a1 = vqmovn_u16(a.val), b1 = vqmovn_u16(b.val);
+    return v_uint8x16(vcombine_u8(a1, b1));
+}
+inline v_uint8x16 v_cvtun_s16(const v_int16x8& a, const v_int16x8& b)
+{
+    uint8x8_t a1 = vqmovun_s16(a.val), b1 = vqmovun_s16(b.val);
+    return v_uint8x16(vcombine_u8(a1, b1));
+}
+inline v_int8x16 v_cvtn_s16(const v_int16x8& a, const v_int16x8& b)
+{
+    int8x8_t a1 = vqmovn_s16(a.val), b1 = vqmovn_s16(b.val);
+    return v_int8x16(vcombine_s8(a1, b1));
+}
+inline void v_storen_u16(uchar* ptr, const v_uint16x8& a) { vst1_u8(ptr, vqmovn_u16(a.val)); }
+inline void v_storeun_s16(uchar* ptr, const v_int16x8& a) { vst1_u8(ptr, vqmovun_s16(a.val)); }
+inline void v_storen_s16(schar* ptr, const v_int16x8& a) { vst1_s8(ptr, vqmovn_s16(a.val)); }
+
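
The v_cvtn_*/v_cvtun_* narrowing helpers above map directly onto vqmovn/vqmovun, i.e. per-lane saturating casts. A scalar model of what v_cvtun_s16 does to each of the sixteen shorts it packs (the helper name is ours):

// Scalar model only; the vector version narrows a and b in one shot.
inline unsigned char saturate_to_u8(short v)
{
    return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
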
+inline v_uint16x8 v_cvtn_u32(const v_uint32x4& a, const v_uint32x4& b)
+{
+    uint16x4_t a1 = vqmovn_u32(a.val), b1 = vqmovn_u32(b.val);
+    return v_uint16x8(vcombine_u16(a1, b1));
+}
+inline v_uint16x8 v_cvtun_s32(const v_int32x4& a, const v_int32x4& b)
+{
+    uint16x4_t a1 = vqmovun_s32(a.val), b1 = vqmovun_s32(b.val);
+    return v_uint16x8(vcombine_u16(a1, b1));
+}
+inline v_int16x8 v_cvtn_s32(const v_int32x4& a, const v_int32x4& b)
+{
+    int16x4_t a1 = vqmovn_s32(a.val), b1 = vqmovn_s32(b.val);
+    return v_int16x8(vcombine_s16(a1, b1));
+}
+inline void v_storen_u32(ushort* ptr, const v_uint32x4& a) { vst1_u16(ptr, vqmovn_u32(a.val)); }
+inline void v_storeun_s32(ushort* ptr, const v_int32x4& a) { vst1_u16(ptr, vqmovun_s32(a.val)); }
+inline void v_storen_s32(short* ptr, const v_int32x4& a) { vst1_s16(ptr, vqmovn_s32(a.val)); }
+
+#define v_shiftn_u16(a, b, n) v_uint8x16(vcombine_u8(vqrshrn_n_u16((a).val, (n)), vqrshrn_n_u16((b).val, (n))))
+#define v_shiftn_s16(a, b, n) v_int8x16(vcombine_s8(vqrshrn_n_s16((a).val, (n)), vqrshrn_n_s16((b).val, (n))))
+#define v_shiftn_u32(a, b, n) v_uint16x8(vcombine_u16(vqrshrn_n_u32((a).val, (n)), vqrshrn_n_u32((b).val, (n))))
+#define v_shiftn_s32(a, b, n) v_int16x8(vcombine_s16(vqrshrn_n_s32((a).val, (n)), vqrshrn_n_s32((b).val, (n))))
+#define v_shiftun_s16(a, b, n) v_uint8x16(vcombine_u8(vqrshrun_n_s16((a).val, (n)), vqrshrun_n_s16((b).val, (n))))
+#define v_shiftun_s32(a, b, n) v_uint16x8(vcombine_u16(vqrshrun_n_s32((a).val, (n)), vqrshrun_n_s32((b).val, (n))))
+#define v_shiftstoren_u16(ptr, a, n) vst1_u8((ptr), vqrshrn_n_u16((a).val, (n)))
+#define v_shiftstoren_s16(ptr, a, n) vst1_s8((ptr), vqrshrn_n_s16((a).val, (n)))
+#define v_shiftstoreun_s16(ptr, a, n) vst1_u8((ptr), vqrshrun_n_s16((a).val, (n)))
+#define v_shiftstoren_u32(ptr, a, n) vst1_u16((ptr), vqrshrn_n_u32((a).val, (n)))
+#define v_shiftstoren_s32(ptr, a, n) vst1_s16((ptr), vqrshrn_n_s32((a).val, (n)))
+#define v_shiftstoreun_s32(ptr, a, n) vst1_u16((ptr), vqrshrun_n_s32((a).val, (n)))
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{
+    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
+    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
+    res = vmlaq_lane_f32(res, m1.val, vl, 1);
+    res = vmlaq_lane_f32(res, m2.val, vh, 0);
+    res = vmlaq_lane_f32(res, m3.val, vh, 1);
+    return v_float32x4(res);
+}
+
+#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a.val = intrin(a.val, b.val); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
+
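
v_matmul above treats m0..m3 as the columns of a 4x4 matrix, accumulating v[k]*mk with vmulq_lane/vmlaq_lane. A scalar reference under that reading (the function and array names are ours):

inline void matmul4x4_ref(const float v[4], const float m[4][4], float res[4])
{
    // m[k] is column k, matching the m0..m3 operands of v_matmul.
    for( int i = 0; i < 4; i++ )
        res[i] = v[0]*m[0][i] + v[1]*m[1][i] + v[2]*m[2][i] + v[3]*m[3][i];
}
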
+inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
+{
+    float32x4_t reciprocal = vrecpeq_f32(b.val);
+    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
+    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
+    return v_float32x4(vmulq_f32(a.val, reciprocal));
+}
+inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
+{
+    float32x4_t reciprocal = vrecpeq_f32(b.val);
+    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
+    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
+    a.val = vmulq_f32(a.val, reciprocal);
+    return a;
+}
+
+#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
+    OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
+    OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
+    OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
+    inline _Tpvec operator ~ (const _Tpvec& a) \
+    { \
+        return _Tpvec(vmvnq_##suffix(a.val)); \
+    }
+
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
+
+#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
+inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+{ \
+    return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
+} \
+inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
+{ \
+    a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
+OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
+OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)
+
+inline v_float32x4 operator ~ (const v_float32x4& a)
+{
+    return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
+}
+
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{
+    float32x4_t e = vrsqrteq_f32(x.val);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e);
+    return v_float32x4(vmulq_f32(e, x.val));
+}
+
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{
+    float32x4_t e = vrsqrteq_f32(x.val);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e);
+    return v_float32x4(e);
+}
+
+inline v_float32x4 v_abs(v_float32x4 x)
+{ return v_float32x4(vabsq_f32(x.val)); }
+
+// TODO: exp, log, sin, cos
+
+#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32) + + +#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \ +inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \ +inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); } + +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, OPENCV_HAL_NOP, s8, u8) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, OPENCV_HAL_NOP, s16, u16) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, OPENCV_HAL_NOP, s32, u32) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32) + +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16) + +// TODO: absdiff for signed integers +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32) + +inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val)); + return v_sqrt(x); +} + +inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val)); +} + +inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_float32x4(vmlaq_f32(c.val, a.val, b.val)); +} + +// trade efficiency for convenience +#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, _Tp, suffix) \ +inline _Tpvec operator << (const _Tpvec& a, int n) \ +{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##suffix((_Tp)n))); } \ +inline _Tpvec operator >> (const _Tpvec& a, int n) \ +{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##suffix((_Tp)-n))); } + +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, short, s16) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, int, s32) + +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mullo, vmulq_u16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mullo, vmulq_s16) +OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, 
v_mulhi2, vqrdmulhq_s16) + +#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec(vld1q_##suffix(ptr)); } \ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(vld1q_##suffix(ptr)); } \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ vst1q_##suffix(ptr, a.val); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ vst1q_##suffix(ptr, a.val); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); } + +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32) +OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32) + +#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + scalartype CV_DECL_ALIGNED(16) buf[4]; \ + v_store_aligned(buf, a); \ + scalartype s0 = scalar_func(buf[0], buf[1]); \ + scalartype s1 = scalar_func(buf[2], buf[3]); \ + return scalar_func(s0, s1); \ +} + +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, max, std::max) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, min, std::min) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, max, std::max) +OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, min, std::min) + +inline int v_signmask(const v_uint8x16& a) +{ + uint8x8_t m0 = vcreate_u8(CV_BIG_UINT(0x0706050403020100)); + uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_u8(m0, m0)); + uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0))); + return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8); +} +inline int v_signmask(const v_int8x16& a) +{ return v_signmask(v_reinterpret_as_u8(a)); } + +inline int v_signmask(const v_uint16x8& a) +{ + uint16x4_t m0 = vcreate_u16(CV_BIG_UINT(0x0003000200010000)); + uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_u16(m0, m0)); + uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0)); + return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4); +} +inline int v_signmask(const v_int16x8& a) +{ return v_signmask(v_reinterpret_as_u16(a)); } + +inline int v_signmask(const v_uint32x4& a) +{ + uint32x2_t m0 = vcreate_u32(CV_BIG_UINT(0x0000000100000000)); + uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_u32(m0, m0)); + uint64x2_t v1 = vpaddlq_u32(v0); + return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2); +} +inline int v_signmask(const v_int32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } +inline int v_signmask(const v_float32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } + +#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, 
suffix, shift) \
+inline bool v_check_all(const v_##_Tpvec& a) \
+{ \
+    _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
+    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
+    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
+} \
+inline bool v_check_any(const v_##_Tpvec& a) \
+{ \
+    _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
+    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
+    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
+}
+
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
+
+inline bool v_check_all(const v_int8x16& a)
+{ return v_check_all(v_reinterpret_as_u8(a)); }
+inline bool v_check_all(const v_int16x8& a)
+{ return v_check_all(v_reinterpret_as_u16(a)); }
+inline bool v_check_all(const v_int32x4& a)
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+inline bool v_check_all(const v_float32x4& a)
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+
+inline bool v_check_any(const v_int8x16& a)
+{ return v_check_any(v_reinterpret_as_u8(a)); }
+inline bool v_check_any(const v_int16x8& a)
+{ return v_check_any(v_reinterpret_as_u16(a)); }
+inline bool v_check_any(const v_int32x4& a)
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+inline bool v_check_any(const v_float32x4& a)
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+
+#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
+OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
+OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
+OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
+OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
+OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
+OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
+
+#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
+    b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
+}
+
+OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{
+    uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr);
+    uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
+    return v_uint32x4(vmovl_u16(v1));
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{
+    int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);
+    int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
+    return v_int32x4(vmovl_s16(v1));
+}
+
+#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
+inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
+{ \
+    _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
+    b0.val = p.val[0]; \
+    b1.val = p.val[1]; \
+} \
+inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
+} \
+inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
+} \
+inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
+    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
+}
+
+OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
+OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
+OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
+OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
+
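
The v_check_all/v_check_any pair above inspects only the sign (top) bit of each lane; the mvn+shift trick leaves exactly those bits for the u64 lane test. Scalar models for the u32 case (the function names are ours):

inline bool check_all_ref(const unsigned a[4])
{
    for( int i = 0; i < 4; i++ )
        if( (a[i] >> 31) == 0 ) return false;  // a lane with sign bit clear
    return true;
}
inline bool check_any_ref(const unsigned a[4])
{
    for( int i = 0; i < 4; i++ )
        if( a[i] >> 31 ) return true;          // at least one sign bit set
    return false;
}
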
+inline v_int32x4 v_round(const v_float32x4& a)
+{
+    static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
+        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
+
+    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
+    return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
+}
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{
+    int32x4_t a1 = vcvtq_s32_f32(a.val);
+    uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
+    return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{
+    int32x4_t a1 = vcvtq_s32_f32(a.val);
+    uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
+    return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{ return v_int32x4(vcvtq_s32_f32(a.val)); }
+
+#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
+inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
+                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
+                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
+                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
+{ \
+    /* m00 m01 m02 m03 */ \
+    /* m10 m11 m12 m13 */ \
+    /* m20 m21 m22 m23 */ \
+    /* m30 m31 m32 m33 */ \
+    _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
+    _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
+    /* m00 m10 m02 m12 */ \
+    /* m01 m11 m03 m13 */ \
+    /* m20 m30 m22 m32 */ \
+    /* m21 m31 m23 m33 */ \
+    b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
+    b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
+    b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
+    b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
+}
+
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
+
+#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
+{ \
+    _Tpvec##x3_t v = vld3q_##suffix(ptr); \
+    a.val = v.val[0]; \
+    b.val = v.val[1]; \
+    c.val = v.val[2]; \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
+                                v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    _Tpvec##x4_t v = vld4q_##suffix(ptr); \
+    a.val = v.val[0]; \
+    b.val = v.val[1]; \
+    c.val = v.val[2]; \
+    d.val = v.val[3]; \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
+{ \
+    _Tpvec##x3_t v; \
+    v.val[0] = a.val; \
+    v.val[1] = b.val; \
+    v.val[2] = c.val; \
+    vst3q_##suffix(ptr, v); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, const v_##_Tpvec& d) \
+{ \ + _Tpvec##x4_t v; \ + v.val[0] = a.val; \ + v.val[1] = b.val; \ + v.val[2] = c.val; \ + v.val[3] = d.val; \ + vst4q_##suffix(ptr, v); \ +} + +OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32) +OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32) + +inline v_float32x4 v_cvt_f32(const v_int32x4& a) +{ + return v_float32x4(vcvtq_f32_s32(a.val)); +} + #else typedef v_reg v_uint8x16; @@ -2141,100 +2756,84 @@ inline v_int32x4 v_setall_s32(int v) { return v_int32x4::all(v); } inline v_float32x4 v_setall_f32(float v) { return v_float32x4::all(v); } inline v_float64x2 v_setall_f64(double v) { return v_float64x2::all(v); } -template inline v_uint8x16 v_reinterpret_u8(const v_reg<_Tp, n>& a) +template inline v_uint8x16 v_reinterpret_as_u8(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -template inline v_int8x16 v_reinterpret_s8(const v_reg<_Tp, n>& a) +template inline v_int8x16 v_reinterpret_as_s8(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -template inline v_uint16x8 v_reinterpret_u16(const v_reg<_Tp, n>& a) +template inline v_uint16x8 v_reinterpret_as_u16(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -template inline v_int16x8 v_reinterpret_s16(const v_reg<_Tp, n>& a) +template inline v_int16x8 v_reinterpret_as_s16(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -template inline v_uint32x4 v_reinterpret_u32(const v_reg<_Tp, n>& a) +template inline v_uint32x4 v_reinterpret_as_u32(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -template inline v_int32x4 v_reinterpret_s32(const v_reg<_Tp, n>& a) +template inline v_int32x4 v_reinterpret_as_s32(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -template inline v_float32x4 v_reinterpret_f32(const v_reg<_Tp, n>& a) +template inline v_float32x4 v_reinterpret_as_f32(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -template inline v_float64x2 v_reinterpret_f64(const v_reg<_Tp, n>& a) +template inline v_float64x2 v_reinterpret_as_f64(const v_reg<_Tp, n>& a) { return v_reg<_Tp, n>::template reinterpret_as(a); } -inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b) +inline v_uint8x16 v_cvtn_u16(const v_uint16x8& a, const v_uint16x8& b) { return v_cvtsat(a, b); } -inline v_uint8x16 v_sat_u8(const v_uint16x8& a, const v_uint16x8& b, int n) +inline v_uint8x16 v_shiftn_u16(const v_uint16x8& a, const v_uint16x8& b, int n) { return v_cvtsat(a, b, n); } -inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b) +inline v_uint8x16 v_cvtun_s16(const v_int16x8& a, const v_int16x8& b) { return v_cvtsat(a, b); } -inline v_uint8x16 v_sat_u8(const v_int16x8& a, const v_int16x8& b, int n) +inline v_uint8x16 v_shiftun_s16(const v_int16x8& a, const v_int16x8& b, int n) { return v_cvtsat(a, b, n); } -inline void v_storesat_u8(uchar* ptr, const v_uint16x8& b) +inline void v_storen_u16(uchar* ptr, const v_uint16x8& b) { return v_storesat(ptr, b); } -inline void v_storesat_u8(uchar* ptr, const v_uint16x8& b, int n) +inline void v_shiftstoren_u16(uchar* ptr, const v_uint16x8& b, int n) { return v_storesat(ptr, b, n); } -inline void v_storesat_u8(uchar* ptr, 
const v_int16x8& b) +inline void v_shiftstoreun_s16(uchar* ptr, const v_int16x8& b) { return v_storesat(ptr, b); } -inline void v_storesat_u8(uchar* ptr, const v_int16x8& b, int n) +inline void v_shiftstoreun_s16(uchar* ptr, const v_int16x8& b, int n) { return v_storesat(ptr, b, n); } -inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b) -{ return v_cvtsat(a, b); } -inline v_int8x16 v_sat_s8(const v_uint16x8& a, const v_uint16x8& b, int n) -{ return v_cvtsat(a, b, n); } -inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b) +inline v_int8x16 v_cvtn_s16(const v_int16x8& a, const v_int16x8& b) { return v_cvtsat(a, b); } -inline v_int8x16 v_sat_s8(const v_int16x8& a, const v_int16x8& b, int n) +inline v_int8x16 v_shiftn_s16(const v_int16x8& a, const v_int16x8& b, int n) { return v_cvtsat(a, b, n); } -inline void v_storesat_s8(schar* ptr, const v_uint16x8& b) +inline void v_storen_s16(schar* ptr, const v_int16x8& b) { return v_storesat(ptr, b); } -inline void v_storesat_s8(schar* ptr, const v_uint16x8& b, int n) -{ return v_storesat(ptr, b, n); } -inline void v_storesat_s8(schar* ptr, const v_int16x8& b) -{ return v_storesat(ptr, b); } -inline void v_storesat_s8(schar* ptr, const v_int16x8& b, int n) +inline void v_shiftstoren_s16(schar* ptr, const v_int16x8& b, int n) { return v_storesat(ptr, b, n); } -inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b) +inline v_uint16x8 v_cvtn_u32(const v_uint32x4& a, const v_uint32x4& b) { return v_cvtsat(a, b); } -inline v_uint16x8 v_sat_u16(const v_uint32x4& a, const v_uint32x4& b, int n) +inline v_uint16x8 v_shiftn_u32(const v_uint32x4& a, const v_uint32x4& b, int n) { return v_cvtsat(a, b, n); } -inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b) +inline v_uint16x8 v_cvtun_s32(const v_int32x4& a, const v_int32x4& b) { return v_cvtsat(a, b); } -inline v_uint16x8 v_sat_u16(const v_int32x4& a, const v_int32x4& b, int n) +inline v_uint16x8 v_shiftun_s32(const v_int32x4& a, const v_int32x4& b, int n) { return v_cvtsat(a, b, n); } -inline void v_storesat_u16(ushort* ptr, const v_uint32x4& b) +inline void v_storen_u32(ushort* ptr, const v_uint32x4& b) { return v_storesat(ptr, b); } -inline void v_storesat_u16(ushort* ptr, const v_uint32x4& b, int n) +inline void v_shiftstoren_u32(ushort* ptr, const v_uint32x4& b, int n) { return v_storesat(ptr, b, n); } -inline void v_storesat_u16(ushort* ptr, const v_int32x4& b) +inline void v_storeun_s32(ushort* ptr, const v_int32x4& b) { return v_storesat(ptr, b); } -inline void v_storesat_u16(ushort* ptr, const v_int32x4& b, int n) +inline void v_shiftstoreun_s32(ushort* ptr, const v_int32x4& b, int n) { return v_storesat(ptr, b, n); } -inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b) -{ return v_cvtsat(a, b); } -inline v_int16x8 v_sat_s16(const v_uint32x4& a, const v_uint32x4& b, int n) -{ return v_cvtsat(a, b, n); } -inline v_int16x8 v_sat_s16(const v_int32x4& a, const v_int32x4& b) +inline v_int16x8 v_cvtn_s32(const v_int32x4& a, const v_int32x4& b) { return v_cvtsat(a, b); } -inline v_int16x8 v_sat_s16(const v_int32x4& a, const v_int32x4& b, int n) +inline v_int16x8 v_shiftn_s32(const v_int32x4& a, const v_int32x4& b, int n) { return v_cvtsat(a, b, n); } -inline void v_storesat_s16(short* ptr, const v_uint32x4& b) +inline void v_storen_s32(short* ptr, const v_int32x4& b) { return v_storesat(ptr, b); } -inline void v_storesat_s16(short* ptr, const v_uint32x4& b, int n) -{ return v_storesat(ptr, b, n); } -inline void v_storesat_s16(short* 
ptr, const v_int32x4& b) -{ return v_storesat(ptr, b); } -inline void v_storesat_s16(short* ptr, const v_int32x4& b, int n) +inline void v_shiftstoren_s32(short* ptr, const v_int32x4& b, int n) { return v_storesat(ptr, b, n); } inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, diff --git a/modules/hal/src/mathfuncs.cpp b/modules/hal/src/mathfuncs.cpp index a3f69facc..e970cfedb 100644 --- a/modules/hal/src/mathfuncs.cpp +++ b/modules/hal/src/mathfuncs.cpp @@ -44,4 +44,1309 @@ namespace cv { namespace hal { +///////////////////////////////////// ATAN2 //////////////////////////////////// +static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI); +static const float atan2_p3 = -0.3258083974640975f*(float)(180/CV_PI); +static const float atan2_p5 = 0.1555786518463281f*(float)(180/CV_PI); +static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI); + +#if CV_NEON +static inline float32x4_t cv_vrecpq_f32(float32x4_t val) +{ + float32x4_t reciprocal = vrecpeq_f32(val); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + return reciprocal; +} +#endif + +void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) +{ + int i = 0; + float scale = angleInDegrees ? 1 : (float)(CV_PI/180); + +#ifdef HAVE_TEGRA_OPTIMIZATION + if (tegra::useTegra() && tegra::FastAtan2_32f(Y, X, angle, len, scale)) + return; +#endif + +#if CV_SSE2 + Cv32suf iabsmask; iabsmask.i = 0x7fffffff; + __m128 eps = _mm_set1_ps((float)DBL_EPSILON), absmask = _mm_set1_ps(iabsmask.f); + __m128 _90 = _mm_set1_ps(90.f), _180 = _mm_set1_ps(180.f), _360 = _mm_set1_ps(360.f); + __m128 z = _mm_setzero_ps(), scale4 = _mm_set1_ps(scale); + __m128 p1 = _mm_set1_ps(atan2_p1), p3 = _mm_set1_ps(atan2_p3); + __m128 p5 = _mm_set1_ps(atan2_p5), p7 = _mm_set1_ps(atan2_p7); + + for( ; i <= len - 4; i += 4 ) + { + __m128 x = _mm_loadu_ps(X + i), y = _mm_loadu_ps(Y + i); + __m128 ax = _mm_and_ps(x, absmask), ay = _mm_and_ps(y, absmask); + __m128 mask = _mm_cmplt_ps(ax, ay); + __m128 tmin = _mm_min_ps(ax, ay), tmax = _mm_max_ps(ax, ay); + __m128 c = _mm_div_ps(tmin, _mm_add_ps(tmax, eps)); + __m128 c2 = _mm_mul_ps(c, c); + __m128 a = _mm_mul_ps(c2, p7); + a = _mm_mul_ps(_mm_add_ps(a, p5), c2); + a = _mm_mul_ps(_mm_add_ps(a, p3), c2); + a = _mm_mul_ps(_mm_add_ps(a, p1), c); + + __m128 b = _mm_sub_ps(_90, a); + a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); + + b = _mm_sub_ps(_180, a); + mask = _mm_cmplt_ps(x, z); + a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); + + b = _mm_sub_ps(_360, a); + mask = _mm_cmplt_ps(y, z); + a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); + + a = _mm_mul_ps(a, scale4); + _mm_storeu_ps(angle + i, a); + } +#elif CV_NEON + float32x4_t eps = vdupq_n_f32((float)DBL_EPSILON); + float32x4_t _90 = vdupq_n_f32(90.f), _180 = vdupq_n_f32(180.f), _360 = vdupq_n_f32(360.f); + float32x4_t z = vdupq_n_f32(0.0f), scale4 = vdupq_n_f32(scale); + float32x4_t p1 = vdupq_n_f32(atan2_p1), p3 = vdupq_n_f32(atan2_p3); + float32x4_t p5 = vdupq_n_f32(atan2_p5), p7 = vdupq_n_f32(atan2_p7); + + for( ; i <= len - 4; i += 4 ) + { + float32x4_t x = vld1q_f32(X + i), y = vld1q_f32(Y + i); + float32x4_t ax = vabsq_f32(x), ay = vabsq_f32(y); + float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); + float32x4_t c = vmulq_f32(tmin, cv_vrecpq_f32(vaddq_f32(tmax, eps))); + float32x4_t c2 = vmulq_f32(c, c); + float32x4_t a = vmulq_f32(c2, p7); + a = 
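+            /* Horner steps for the odd polynomial approximation
+               atan(c) ~ (((p7*t + p5)*t + p3)*t + p1)*c with t = c^2
+               (coefficients are pre-scaled to return degrees); the
+               vbslq_f32 selects below fold the result into the right
+               octant and quadrant without branches */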
vmulq_f32(vaddq_f32(a, p5), c2); + a = vmulq_f32(vaddq_f32(a, p3), c2); + a = vmulq_f32(vaddq_f32(a, p1), c); + + a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); + a = vbslq_f32(vcltq_f32(x, z), vsubq_f32(_180, a), a); + a = vbslq_f32(vcltq_f32(y, z), vsubq_f32(_360, a), a); + + vst1q_f32(angle + i, vmulq_f32(a, scale4)); + } +#endif + + for( ; i < len; i++ ) + { + float x = X[i], y = Y[i]; + float ax = std::abs(x), ay = std::abs(y); + float a, c, c2; + if( ax >= ay ) + { + c = ay/(ax + (float)DBL_EPSILON); + c2 = c*c; + a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; + } + else + { + c = ax/(ay + (float)DBL_EPSILON); + c2 = c*c; + a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; + } + if( x < 0 ) + a = 180.f - a; + if( y < 0 ) + a = 360.f - a; + angle[i] = (float)(a*scale); + } +} + + +void magnitude(const float* x, const float* y, float* mag, int len) +{ + int i = 0; + +#if CV_SIMD128 + for( ; i <= len - 8; i += 8 ) + { + v_float32x4 x0 = v_load(x + i), x1 = v_load(x + i + 4); + v_float32x4 y0 = v_load(y + i), y1 = v_load(y + i + 4); + x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); + x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); + v_store(mag + i, x0); + v_store(mag + i + 4, x1); + } +#endif + + for( ; i < len; i++ ) + { + float x0 = x[i], y0 = y[i]; + mag[i] = std::sqrt(x0*x0 + y0*y0); + } +} + +void magnitude(const double* x, const double* y, double* mag, int len) +{ + int i = 0; + +#if defined CV_SIMD128_64F && CV_SIMD128_64F + for( ; i <= len - 4; i += 4 ) + { + v_float64x2 x0 = v_load(x + i), x1 = v_load(x + i + 2); + v_float64x2 y0 = v_load(y + i), y1 = v_load(y + i + 2); + x0 = v_sqrt(v_muladd(x0, x0, y0*y0)); + x1 = v_sqrt(v_muladd(x1, x1, y1*y1)); + v_store(mag + i, x0); + v_store(mag + i + 2, x1); + } +#endif + + for( ; i < len; i++ ) + { + double x0 = x[i], y0 = y[i]; + mag[i] = std::sqrt(x0*x0 + y0*y0); + } +} + + +void invSqrt(const float* src, float* dst, int len) +{ + int i = 0; + +#if CV_SIMD128 + for( ; i <= len - 8; i += 8 ) + { + v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4); + t0 = v_invsqrt(t0); + t1 = v_invsqrt(t1); + v_store(dst + i, t0); v_store(dst + i + 4, t1); + } +#endif + + for( ; i < len; i++ ) + dst[i] = 1/std::sqrt(src[i]); +} + + +void invSqrt(const double* src, double* dst, int len) +{ + int i = 0; + +#if CV_SSE2 + __m128d v_1 = _mm_set1_pd(1.0); + for ( ; i <= len - 2; i += 2) + _mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i)))); +#endif + + for( ; i < len; i++ ) + dst[i] = 1/std::sqrt(src[i]); +} + + +void sqrt(const float* src, float* dst, int len) +{ + int i = 0; + +#if CV_SIMD128 + for( ; i <= len - 8; i += 8 ) + { + v_float32x4 t0 = v_load(src + i), t1 = v_load(src + i + 4); + t0 = v_sqrt(t0); + t1 = v_sqrt(t1); + v_store(dst + i, t0); v_store(dst + i + 4, t1); + } +#endif + + for( ; i < len; i++ ) + dst[i] = std::sqrt(src[i]); +} + + +void sqrt(const double* src, double* dst, int len) +{ + int i = 0; + +#if defined CV_SIMD128_64F && CV_SIMD128_64F + for( ; i <= len - 4; i += 4 ) + { + v_float64x2 t0 = v_load(src + i), t1 = v_load(src + i + 2); + t0 = v_sqrt(t0); + t1 = v_sqrt(t1); + v_store(dst + i, t0); v_store(dst + i + 2, t1); + } +#endif + + for( ; i < len; i++ ) + dst[i] = std::sqrt(src[i]); +} + +////////////////////////////////////// EXP ///////////////////////////////////// + +typedef union +{ + struct { +#if ( defined( WORDS_BIGENDIAN ) && !defined( OPENCV_UNIVERSAL_BUILD ) ) || defined( __BIG_ENDIAN__ ) + int hi; + int lo; +#else + int lo; + int hi; +#endif + } i; + 
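+    /* DBLINT overlays a double with its two 32-bit halves so the exp/log
+       code can read and patch the IEEE-754 sign/exponent bits directly;
+       the #if above keeps .hi mapped to the high word on big-endian builds */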
double d; +} +DBLINT; + +#define EXPTAB_SCALE 6 +#define EXPTAB_MASK ((1 << EXPTAB_SCALE) - 1) + +#define EXPPOLY_32F_A0 .9670371139572337719125840413672004409288e-2 + +static const double expTab[] = { + 1.0 * EXPPOLY_32F_A0, + 1.0108892860517004600204097905619 * EXPPOLY_32F_A0, + 1.0218971486541166782344801347833 * EXPPOLY_32F_A0, + 1.0330248790212284225001082839705 * EXPPOLY_32F_A0, + 1.0442737824274138403219664787399 * EXPPOLY_32F_A0, + 1.0556451783605571588083413251529 * EXPPOLY_32F_A0, + 1.0671404006768236181695211209928 * EXPPOLY_32F_A0, + 1.0787607977571197937406800374385 * EXPPOLY_32F_A0, + 1.0905077326652576592070106557607 * EXPPOLY_32F_A0, + 1.1023825833078409435564142094256 * EXPPOLY_32F_A0, + 1.1143867425958925363088129569196 * EXPPOLY_32F_A0, + 1.126521618608241899794798643787 * EXPPOLY_32F_A0, + 1.1387886347566916537038302838415 * EXPPOLY_32F_A0, + 1.151189229952982705817759635202 * EXPPOLY_32F_A0, + 1.1637248587775775138135735990922 * EXPPOLY_32F_A0, + 1.1763969916502812762846457284838 * EXPPOLY_32F_A0, + 1.1892071150027210667174999705605 * EXPPOLY_32F_A0, + 1.2021567314527031420963969574978 * EXPPOLY_32F_A0, + 1.2152473599804688781165202513388 * EXPPOLY_32F_A0, + 1.2284805361068700056940089577928 * EXPPOLY_32F_A0, + 1.2418578120734840485936774687266 * EXPPOLY_32F_A0, + 1.2553807570246910895793906574423 * EXPPOLY_32F_A0, + 1.2690509571917332225544190810323 * EXPPOLY_32F_A0, + 1.2828700160787782807266697810215 * EXPPOLY_32F_A0, + 1.2968395546510096659337541177925 * EXPPOLY_32F_A0, + 1.3109612115247643419229917863308 * EXPPOLY_32F_A0, + 1.3252366431597412946295370954987 * EXPPOLY_32F_A0, + 1.3396675240533030053600306697244 * EXPPOLY_32F_A0, + 1.3542555469368927282980147401407 * EXPPOLY_32F_A0, + 1.3690024229745906119296011329822 * EXPPOLY_32F_A0, + 1.3839098819638319548726595272652 * EXPPOLY_32F_A0, + 1.3989796725383111402095281367152 * EXPPOLY_32F_A0, + 1.4142135623730950488016887242097 * EXPPOLY_32F_A0, + 1.4296133383919700112350657782751 * EXPPOLY_32F_A0, + 1.4451808069770466200370062414717 * EXPPOLY_32F_A0, + 1.4609177941806469886513028903106 * EXPPOLY_32F_A0, + 1.476826145939499311386907480374 * EXPPOLY_32F_A0, + 1.4929077282912648492006435314867 * EXPPOLY_32F_A0, + 1.5091644275934227397660195510332 * EXPPOLY_32F_A0, + 1.5255981507445383068512536895169 * EXPPOLY_32F_A0, + 1.5422108254079408236122918620907 * EXPPOLY_32F_A0, + 1.5590044002378369670337280894749 * EXPPOLY_32F_A0, + 1.5759808451078864864552701601819 * EXPPOLY_32F_A0, + 1.5931421513422668979372486431191 * EXPPOLY_32F_A0, + 1.6104903319492543081795206673574 * EXPPOLY_32F_A0, + 1.628027421857347766848218522014 * EXPPOLY_32F_A0, + 1.6457554781539648445187567247258 * EXPPOLY_32F_A0, + 1.6636765803267364350463364569764 * EXPPOLY_32F_A0, + 1.6817928305074290860622509524664 * EXPPOLY_32F_A0, + 1.7001063537185234695013625734975 * EXPPOLY_32F_A0, + 1.7186192981224779156293443764563 * EXPPOLY_32F_A0, + 1.7373338352737062489942020818722 * EXPPOLY_32F_A0, + 1.7562521603732994831121606193753 * EXPPOLY_32F_A0, + 1.7753764925265212525505592001993 * EXPPOLY_32F_A0, + 1.7947090750031071864277032421278 * EXPPOLY_32F_A0, + 1.8142521755003987562498346003623 * EXPPOLY_32F_A0, + 1.8340080864093424634870831895883 * EXPPOLY_32F_A0, + 1.8539791250833855683924530703377 * EXPPOLY_32F_A0, + 1.8741676341102999013299989499544 * EXPPOLY_32F_A0, + 1.8945759815869656413402186534269 * EXPPOLY_32F_A0, + 1.9152065613971472938726112702958 * EXPPOLY_32F_A0, + 1.9360617934922944505980559045667 * EXPPOLY_32F_A0, + 1.9571441241754002690183222516269 * 
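+    /* each expTab entry j is 2^(j/64) * EXPPOLY_32F_A0: EXPTAB_SCALE = 6
+       gives 64 equally spaced octave subdivisions (entry 32 is sqrt(2)
+       times the A0 factor) */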
EXPPOLY_32F_A0,
+ 1.9784560263879509682582499181312 * EXPPOLY_32F_A0,
+};
+
+
+// the code below uses _mm_cast* intrinsics, which are not available on VS2005
+#if (defined _MSC_VER && _MSC_VER < 1500) || \
+(!defined __APPLE__ && defined __GNUC__ && __GNUC__*100 + __GNUC_MINOR__ < 402)
+#undef CV_SSE2
+#define CV_SSE2 0
+#endif
+
+static const double exp_prescale = 1.4426950408889634073599246810019 * (1 << EXPTAB_SCALE);
+static const double exp_postscale = 1./(1 << EXPTAB_SCALE);
+static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < 3000
+
+void exp( const float *_x, float *y, int n )
+{
+ static const float
+ A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
+ A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
+ A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0),
+ A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0);
+
+#undef EXPPOLY
+#define EXPPOLY(x) \
+(((((x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)
+
+ int i = 0;
+ const Cv32suf* x = (const Cv32suf*)_x;
+ Cv32suf buf[4];
+
+#if CV_SSE2
+ if( n >= 8 )
+ {
+ static const __m128d prescale2 = _mm_set1_pd(exp_prescale);
+ static const __m128 postscale4 = _mm_set1_ps((float)exp_postscale);
+ static const __m128 maxval4 = _mm_set1_ps((float)(exp_max_val/exp_prescale));
+ static const __m128 minval4 = _mm_set1_ps((float)(-exp_max_val/exp_prescale));
+
+ static const __m128 mA1 = _mm_set1_ps(A1);
+ static const __m128 mA2 = _mm_set1_ps(A2);
+ static const __m128 mA3 = _mm_set1_ps(A3);
+ static const __m128 mA4 = _mm_set1_ps(A4);
+ bool y_aligned = (size_t)(void*)y % 16 == 0;
+
+ ushort CV_DECL_ALIGNED(16) tab_idx[8];
+
+ for( ; i <= n - 8; i += 8 )
+ {
+ __m128 xf0, xf1;
+ xf0 = _mm_loadu_ps(&x[i].f);
+ xf1 = _mm_loadu_ps(&x[i+4].f);
+ __m128i xi0, xi1, xi2, xi3;
+
+ xf0 = _mm_min_ps(_mm_max_ps(xf0, minval4), maxval4);
+ xf1 = _mm_min_ps(_mm_max_ps(xf1, minval4), maxval4);
+
+ __m128d xd0 = _mm_cvtps_pd(xf0);
+ __m128d xd2 = _mm_cvtps_pd(_mm_movehl_ps(xf0, xf0));
+ __m128d xd1 = _mm_cvtps_pd(xf1);
+ __m128d xd3 = _mm_cvtps_pd(_mm_movehl_ps(xf1, xf1));
+
+ xd0 = _mm_mul_pd(xd0, prescale2);
+ xd2 = _mm_mul_pd(xd2, prescale2);
+ xd1 = _mm_mul_pd(xd1, prescale2);
+ xd3 = _mm_mul_pd(xd3, prescale2);
+
+ xi0 = _mm_cvtpd_epi32(xd0);
+ xi2 = _mm_cvtpd_epi32(xd2);
+
+ xi1 = _mm_cvtpd_epi32(xd1);
+ xi3 = _mm_cvtpd_epi32(xd3);
+
+ xd0 = _mm_sub_pd(xd0, _mm_cvtepi32_pd(xi0));
+ xd2 = _mm_sub_pd(xd2, _mm_cvtepi32_pd(xi2));
+ xd1 = _mm_sub_pd(xd1, _mm_cvtepi32_pd(xi1));
+ xd3 = _mm_sub_pd(xd3, _mm_cvtepi32_pd(xi3));
+
+ xf0 = _mm_movelh_ps(_mm_cvtpd_ps(xd0), _mm_cvtpd_ps(xd2));
+ xf1 = _mm_movelh_ps(_mm_cvtpd_ps(xd1), _mm_cvtpd_ps(xd3));
+
+ xf0 = _mm_mul_ps(xf0, postscale4);
+ xf1 = _mm_mul_ps(xf1, postscale4);
+
+ xi0 = _mm_unpacklo_epi64(xi0, xi2);
+ xi1 = _mm_unpacklo_epi64(xi1, xi3);
+ xi0 = _mm_packs_epi32(xi0, xi1);
+
+ _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi16(EXPTAB_MASK)));
+
+ xi0 = _mm_add_epi16(_mm_srai_epi16(xi0, EXPTAB_SCALE), _mm_set1_epi16(127));
+ xi0 = _mm_max_epi16(xi0, _mm_setzero_si128());
+ xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(255));
+ xi1 = _mm_unpackhi_epi16(xi0, _mm_setzero_si128());
+ xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128());
+
+ __m128d yd0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1]));
+ __m128d yd1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3]));
+ __m128d yd2 = 
_mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[4]), _mm_load_sd(expTab + tab_idx[5])); + __m128d yd3 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[6]), _mm_load_sd(expTab + tab_idx[7])); + + __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); + __m128 yf1 = _mm_movelh_ps(_mm_cvtpd_ps(yd2), _mm_cvtpd_ps(yd3)); + + yf0 = _mm_mul_ps(yf0, _mm_castsi128_ps(_mm_slli_epi32(xi0, 23))); + yf1 = _mm_mul_ps(yf1, _mm_castsi128_ps(_mm_slli_epi32(xi1, 23))); + + __m128 zf0 = _mm_add_ps(xf0, mA1); + __m128 zf1 = _mm_add_ps(xf1, mA1); + + zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA2); + zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA2); + + zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA3); + zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA3); + + zf0 = _mm_add_ps(_mm_mul_ps(zf0, xf0), mA4); + zf1 = _mm_add_ps(_mm_mul_ps(zf1, xf1), mA4); + + zf0 = _mm_mul_ps(zf0, yf0); + zf1 = _mm_mul_ps(zf1, yf1); + + if( y_aligned ) + { + _mm_store_ps(y + i, zf0); + _mm_store_ps(y + i + 4, zf1); + } + else + { + _mm_storeu_ps(y + i, zf0); + _mm_storeu_ps(y + i + 4, zf1); + } + } + } + else +#endif + for( ; i <= n - 4; i += 4 ) + { + double x0 = x[i].f * exp_prescale; + double x1 = x[i + 1].f * exp_prescale; + double x2 = x[i + 2].f * exp_prescale; + double x3 = x[i + 3].f * exp_prescale; + int val0, val1, val2, val3, t; + + if( ((x[i].i >> 23) & 255) > 127 + 10 ) + x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; + + if( ((x[i+1].i >> 23) & 255) > 127 + 10 ) + x1 = x[i+1].i < 0 ? -exp_max_val : exp_max_val; + + if( ((x[i+2].i >> 23) & 255) > 127 + 10 ) + x2 = x[i+2].i < 0 ? -exp_max_val : exp_max_val; + + if( ((x[i+3].i >> 23) & 255) > 127 + 10 ) + x3 = x[i+3].i < 0 ? -exp_max_val : exp_max_val; + + val0 = cvRound(x0); + val1 = cvRound(x1); + val2 = cvRound(x2); + val3 = cvRound(x3); + + x0 = (x0 - val0)*exp_postscale; + x1 = (x1 - val1)*exp_postscale; + x2 = (x2 - val2)*exp_postscale; + x3 = (x3 - val3)*exp_postscale; + + t = (val0 >> EXPTAB_SCALE) + 127; + t = !(t & ~255) ? t : t < 0 ? 0 : 255; + buf[0].i = t << 23; + + t = (val1 >> EXPTAB_SCALE) + 127; + t = !(t & ~255) ? t : t < 0 ? 0 : 255; + buf[1].i = t << 23; + + t = (val2 >> EXPTAB_SCALE) + 127; + t = !(t & ~255) ? t : t < 0 ? 0 : 255; + buf[2].i = t << 23; + + t = (val3 >> EXPTAB_SCALE) + 127; + t = !(t & ~255) ? t : t < 0 ? 0 : 255; + buf[3].i = t << 23; + + x0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); + x1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); + + y[i] = (float)x0; + y[i + 1] = (float)x1; + + x2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); + x3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); + + y[i + 2] = (float)x2; + y[i + 3] = (float)x3; + } + + for( ; i < n; i++ ) + { + double x0 = x[i].f * exp_prescale; + int val0, t; + + if( ((x[i].i >> 23) & 255) > 127 + 10 ) + x0 = x[i].i < 0 ? -exp_max_val : exp_max_val; + + val0 = cvRound(x0); + t = (val0 >> EXPTAB_SCALE) + 127; + t = !(t & ~255) ? t : t < 0 ? 
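+        /* branchless clamp of the biased exponent t to [0, 255]; putting
+           t << 23 into a float's bit pattern materializes the power of two,
+           so e^x is reassembled as 2^(val0>>6) * expTab[val0 & 63] * EXPPOLY(frac) */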
0 : 255; + + buf[0].i = t << 23; + x0 = (x0 - val0)*exp_postscale; + + y[i] = (float)(buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY(x0)); + } +} + +void exp( const double *_x, double *y, int n ) +{ + static const double + A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0, + A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0, + A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0, + A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0, + A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0, + A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0; + +#undef EXPPOLY +#define EXPPOLY(x) (((((A0*(x) + A1)*(x) + A2)*(x) + A3)*(x) + A4)*(x) + A5) + + int i = 0; + Cv64suf buf[4]; + const Cv64suf* x = (const Cv64suf*)_x; + +#if CV_SSE2 + static const __m128d prescale2 = _mm_set1_pd(exp_prescale); + static const __m128d postscale2 = _mm_set1_pd(exp_postscale); + static const __m128d maxval2 = _mm_set1_pd(exp_max_val); + static const __m128d minval2 = _mm_set1_pd(-exp_max_val); + + static const __m128d mA0 = _mm_set1_pd(A0); + static const __m128d mA1 = _mm_set1_pd(A1); + static const __m128d mA2 = _mm_set1_pd(A2); + static const __m128d mA3 = _mm_set1_pd(A3); + static const __m128d mA4 = _mm_set1_pd(A4); + static const __m128d mA5 = _mm_set1_pd(A5); + + int CV_DECL_ALIGNED(16) tab_idx[4]; + + for( ; i <= n - 4; i += 4 ) + { + __m128d xf0 = _mm_loadu_pd(&x[i].f), xf1 = _mm_loadu_pd(&x[i+2].f); + __m128i xi0, xi1; + xf0 = _mm_min_pd(_mm_max_pd(xf0, minval2), maxval2); + xf1 = _mm_min_pd(_mm_max_pd(xf1, minval2), maxval2); + xf0 = _mm_mul_pd(xf0, prescale2); + xf1 = _mm_mul_pd(xf1, prescale2); + + xi0 = _mm_cvtpd_epi32(xf0); + xi1 = _mm_cvtpd_epi32(xf1); + xf0 = _mm_mul_pd(_mm_sub_pd(xf0, _mm_cvtepi32_pd(xi0)), postscale2); + xf1 = _mm_mul_pd(_mm_sub_pd(xf1, _mm_cvtepi32_pd(xi1)), postscale2); + + xi0 = _mm_unpacklo_epi64(xi0, xi1); + _mm_store_si128((__m128i*)tab_idx, _mm_and_si128(xi0, _mm_set1_epi32(EXPTAB_MASK))); + + xi0 = _mm_add_epi32(_mm_srai_epi32(xi0, EXPTAB_SCALE), _mm_set1_epi32(1023)); + xi0 = _mm_packs_epi32(xi0, xi0); + xi0 = _mm_max_epi16(xi0, _mm_setzero_si128()); + xi0 = _mm_min_epi16(xi0, _mm_set1_epi16(2047)); + xi0 = _mm_unpacklo_epi16(xi0, _mm_setzero_si128()); + xi1 = _mm_unpackhi_epi32(xi0, _mm_setzero_si128()); + xi0 = _mm_unpacklo_epi32(xi0, _mm_setzero_si128()); + + __m128d yf0 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[0]), _mm_load_sd(expTab + tab_idx[1])); + __m128d yf1 = _mm_unpacklo_pd(_mm_load_sd(expTab + tab_idx[2]), _mm_load_sd(expTab + tab_idx[3])); + yf0 = _mm_mul_pd(yf0, _mm_castsi128_pd(_mm_slli_epi64(xi0, 52))); + yf1 = _mm_mul_pd(yf1, _mm_castsi128_pd(_mm_slli_epi64(xi1, 52))); + + __m128d zf0 = _mm_add_pd(_mm_mul_pd(mA0, xf0), mA1); + __m128d zf1 = _mm_add_pd(_mm_mul_pd(mA0, xf1), mA1); + + zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA2); + zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA2); + + zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA3); + zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA3); + + zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA4); + zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA4); + + zf0 = _mm_add_pd(_mm_mul_pd(zf0, xf0), mA5); + zf1 = _mm_add_pd(_mm_mul_pd(zf1, xf1), mA5); + + zf0 = _mm_mul_pd(zf0, yf0); + zf1 = _mm_mul_pd(zf1, yf1); + + _mm_storeu_pd(y + i, zf0); + _mm_storeu_pd(y + i + 2, zf1); + } +#endif + for( ; i <= n - 4; i += 4 ) + { + double x0 = x[i].f * exp_prescale; + double x1 = x[i + 1].f * exp_prescale; + double x2 = x[i + 2].f * exp_prescale; + double x3 = x[i + 3].f * exp_prescale; + + double y0, y1, y2, y3; + int 
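+        /* the double variant repeats the scheme with IEEE-754 doubles:
+           exponent bias 1023, clamp to [0, 2047], (int64)t << 52 */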
val0, val1, val2, val3, t; + + t = (int)(x[i].i >> 52); + if( (t & 2047) > 1023 + 10 ) + x0 = t < 0 ? -exp_max_val : exp_max_val; + + t = (int)(x[i+1].i >> 52); + if( (t & 2047) > 1023 + 10 ) + x1 = t < 0 ? -exp_max_val : exp_max_val; + + t = (int)(x[i+2].i >> 52); + if( (t & 2047) > 1023 + 10 ) + x2 = t < 0 ? -exp_max_val : exp_max_val; + + t = (int)(x[i+3].i >> 52); + if( (t & 2047) > 1023 + 10 ) + x3 = t < 0 ? -exp_max_val : exp_max_val; + + val0 = cvRound(x0); + val1 = cvRound(x1); + val2 = cvRound(x2); + val3 = cvRound(x3); + + x0 = (x0 - val0)*exp_postscale; + x1 = (x1 - val1)*exp_postscale; + x2 = (x2 - val2)*exp_postscale; + x3 = (x3 - val3)*exp_postscale; + + t = (val0 >> EXPTAB_SCALE) + 1023; + t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; + buf[0].i = (int64)t << 52; + + t = (val1 >> EXPTAB_SCALE) + 1023; + t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; + buf[1].i = (int64)t << 52; + + t = (val2 >> EXPTAB_SCALE) + 1023; + t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; + buf[2].i = (int64)t << 52; + + t = (val3 >> EXPTAB_SCALE) + 1023; + t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; + buf[3].i = (int64)t << 52; + + y0 = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); + y1 = buf[1].f * expTab[val1 & EXPTAB_MASK] * EXPPOLY( x1 ); + + y[i] = y0; + y[i + 1] = y1; + + y2 = buf[2].f * expTab[val2 & EXPTAB_MASK] * EXPPOLY( x2 ); + y3 = buf[3].f * expTab[val3 & EXPTAB_MASK] * EXPPOLY( x3 ); + + y[i + 2] = y2; + y[i + 3] = y3; + } + + for( ; i < n; i++ ) + { + double x0 = x[i].f * exp_prescale; + int val0, t; + + t = (int)(x[i].i >> 52); + if( (t & 2047) > 1023 + 10 ) + x0 = t < 0 ? -exp_max_val : exp_max_val; + + val0 = cvRound(x0); + t = (val0 >> EXPTAB_SCALE) + 1023; + t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; + + buf[0].i = (int64)t << 52; + x0 = (x0 - val0)*exp_postscale; + + y[i] = buf[0].f * expTab[val0 & EXPTAB_MASK] * EXPPOLY( x0 ); + } +} + +#undef EXPTAB_SCALE +#undef EXPTAB_MASK +#undef EXPPOLY_32F_A0 + +/////////////////////////////////////////// LOG /////////////////////////////////////// + +#define LOGTAB_SCALE 8 +#define LOGTAB_MASK ((1 << LOGTAB_SCALE) - 1) +#define LOGTAB_MASK2 ((1 << (20 - LOGTAB_SCALE)) - 1) +#define LOGTAB_MASK2_32F ((1 << (23 - LOGTAB_SCALE)) - 1) + +static const double CV_DECL_ALIGNED(16) icvLogTab[] = { + 0.0000000000000000000000000000000000000000, 1.000000000000000000000000000000000000000, + .00389864041565732288852075271279318258166, .9961089494163424124513618677042801556420, + .00778214044205494809292034119607706088573, .9922480620155038759689922480620155038760, + .01165061721997527263705585198749759001657, .9884169884169884169884169884169884169884, + .01550418653596525274396267235488267033361, .9846153846153846153846153846153846153846, + .01934296284313093139406447562578250654042, .9808429118773946360153256704980842911877, + .02316705928153437593630670221500622574241, .9770992366412213740458015267175572519084, + .02697658769820207233514075539915211265906, .9733840304182509505703422053231939163498, + .03077165866675368732785500469617545604706, .9696969696969696969696969696969696969697, + .03455238150665972812758397481047722976656, .9660377358490566037735849056603773584906, + .03831886430213659461285757856785494368522, .9624060150375939849624060150375939849624, + .04207121392068705056921373852674150839447, .9588014981273408239700374531835205992509, + .04580953603129420126371940114040626212953, .9552238805970149253731343283582089552239, + .04953393512227662748292900118940451648088, .9516728624535315985130111524163568773234, + 
.05324451451881227759255210685296333394944, .9481481481481481481481481481481481481481, + .05694137640013842427411105973078520037234, .9446494464944649446494464944649446494465, + .06062462181643483993820353816772694699466, .9411764705882352941176470588235294117647, + .06429435070539725460836422143984236754475, .9377289377289377289377289377289377289377, + .06795066190850773679699159401934593915938, .9343065693430656934306569343065693430657, + .07159365318700880442825962290953611955044, .9309090909090909090909090909090909090909, + .07522342123758751775142172846244648098944, .9275362318840579710144927536231884057971, + .07884006170777602129362549021607264876369, .9241877256317689530685920577617328519856, + .08244366921107458556772229485432035289706, .9208633093525179856115107913669064748201, + .08603433734180314373940490213499288074675, .9175627240143369175627240143369175627240, + .08961215868968712416897659522874164395031, .9142857142857142857142857142857142857143, + .09317722485418328259854092721070628613231, .9110320284697508896797153024911032028470, + .09672962645855109897752299730200320482256, .9078014184397163120567375886524822695035, + .10026945316367513738597949668474029749630, .9045936395759717314487632508833922261484, + .10379679368164355934833764649738441221420, .9014084507042253521126760563380281690141, + .10731173578908805021914218968959175981580, .8982456140350877192982456140350877192982, + .11081436634029011301105782649756292812530, .8951048951048951048951048951048951048951, + .11430477128005862852422325204315711744130, .8919860627177700348432055749128919860627, + .11778303565638344185817487641543266363440, .8888888888888888888888888888888888888889, + .12124924363286967987640707633545389398930, .8858131487889273356401384083044982698962, + .12470347850095722663787967121606925502420, .8827586206896551724137931034482758620690, + .12814582269193003360996385708858724683530, .8797250859106529209621993127147766323024, + .13157635778871926146571524895989568904040, .8767123287671232876712328767123287671233, + .13499516453750481925766280255629681050780, .8737201365187713310580204778156996587031, + .13840232285911913123754857224412262439730, .8707482993197278911564625850340136054422, + .14179791186025733629172407290752744302150, .8677966101694915254237288135593220338983, + .14518200984449788903951628071808954700830, .8648648648648648648648648648648648648649, + .14855469432313711530824207329715136438610, .8619528619528619528619528619528619528620, + .15191604202584196858794030049466527998450, .8590604026845637583892617449664429530201, + .15526612891112392955683674244937719777230, .8561872909698996655518394648829431438127, + .15860503017663857283636730244325008243330, .8533333333333333333333333333333333333333, + .16193282026931324346641360989451641216880, .8504983388704318936877076411960132890365, + .16524957289530714521497145597095368430010, .8476821192052980132450331125827814569536, + .16855536102980664403538924034364754334090, .8448844884488448844884488448844884488449, + .17185025692665920060697715143760433420540, .8421052631578947368421052631578947368421, + .17513433212784912385018287750426679849630, .8393442622950819672131147540983606557377, + .17840765747281828179637841458315961062910, .8366013071895424836601307189542483660131, + .18167030310763465639212199675966985523700, .8338762214983713355048859934853420195440, + .18492233849401198964024217730184318497780, .8311688311688311688311688311688311688312, + .18816383241818296356839823602058459073300, 
.8284789644012944983818770226537216828479, + .19139485299962943898322009772527962923050, .8258064516129032258064516129032258064516, + .19461546769967164038916962454095482826240, .8231511254019292604501607717041800643087, + .19782574332991986754137769821682013571260, .8205128205128205128205128205128205128205, + .20102574606059073203390141770796617493040, .8178913738019169329073482428115015974441, + .20421554142869088876999228432396193966280, .8152866242038216560509554140127388535032, + .20739519434607056602715147164417430758480, .8126984126984126984126984126984126984127, + .21056476910734961416338251183333341032260, .8101265822784810126582278481012658227848, + .21372432939771812687723695489694364368910, .8075709779179810725552050473186119873817, + .21687393830061435506806333251006435602900, .8050314465408805031446540880503144654088, + .22001365830528207823135744547471404075630, .8025078369905956112852664576802507836991, + .22314355131420973710199007200571941211830, .8000000000000000000000000000000000000000, + .22626367865045338145790765338460914790630, .7975077881619937694704049844236760124611, + .22937410106484582006380890106811420992010, .7950310559006211180124223602484472049689, + .23247487874309405442296849741978803649550, .7925696594427244582043343653250773993808, + .23556607131276688371634975283086532726890, .7901234567901234567901234567901234567901, + .23864773785017498464178231643018079921600, .7876923076923076923076923076923076923077, + .24171993688714515924331749374687206000090, .7852760736196319018404907975460122699387, + .24478272641769091566565919038112042471760, .7828746177370030581039755351681957186544, + .24783616390458124145723672882013488560910, .7804878048780487804878048780487804878049, + .25088030628580937353433455427875742316250, .7781155015197568389057750759878419452888, + .25391520998096339667426946107298135757450, .7757575757575757575757575757575757575758, + .25694093089750041913887912414793390780680, .7734138972809667673716012084592145015106, + .25995752443692604627401010475296061486000, .7710843373493975903614457831325301204819, + .26296504550088134477547896494797896593800, .7687687687687687687687687687687687687688, + .26596354849713793599974565040611196309330, .7664670658682634730538922155688622754491, + .26895308734550393836570947314612567424780, .7641791044776119402985074626865671641791, + .27193371548364175804834985683555714786050, .7619047619047619047619047619047619047619, + .27490548587279922676529508862586226314300, .7596439169139465875370919881305637982196, + .27786845100345625159121709657483734190480, .7573964497041420118343195266272189349112, + .28082266290088775395616949026589281857030, .7551622418879056047197640117994100294985, + .28376817313064456316240580235898960381750, .7529411764705882352941176470588235294118, + .28670503280395426282112225635501090437180, .7507331378299120234604105571847507331378, + .28963329258304265634293983566749375313530, .7485380116959064327485380116959064327485, + .29255300268637740579436012922087684273730, .7463556851311953352769679300291545189504, + .29546421289383584252163927885703742504130, .7441860465116279069767441860465116279070, + .29836697255179722709783618483925238251680, .7420289855072463768115942028985507246377, + .30126133057816173455023545102449133992200, .7398843930635838150289017341040462427746, + .30414733546729666446850615102448500692850, .7377521613832853025936599423631123919308, + .30702503529491181888388950937951449304830, .7356321839080459770114942528735632183908, + 
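+    /* icvLogTab rows are ( ln(1 + k/256), 1/(1 + k/256) ) for k = 0..255,
+       closed by ( ln 2, 0.5 ); the reciprocal column lets LOGTAB_TRANSLATE
+       scale the mantissa residual without a division */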
.30989447772286465854207904158101882785550, .7335243553008595988538681948424068767908, + .31275571000389684739317885942000430077330, .7314285714285714285714285714285714285714, + .31560877898630329552176476681779604405180, .7293447293447293447293447293447293447293, + .31845373111853458869546784626436419785030, .7272727272727272727272727272727272727273, + .32129061245373424782201254856772720813750, .7252124645892351274787535410764872521246, + .32411946865421192853773391107097268104550, .7231638418079096045197740112994350282486, + .32694034499585328257253991068864706903700, .7211267605633802816901408450704225352113, + .32975328637246797969240219572384376078850, .7191011235955056179775280898876404494382, + .33255833730007655635318997155991382896900, .7170868347338935574229691876750700280112, + .33535554192113781191153520921943709254280, .7150837988826815642458100558659217877095, + .33814494400871636381467055798566434532400, .7130919220055710306406685236768802228412, + .34092658697059319283795275623560883104800, .7111111111111111111111111111111111111111, + .34370051385331840121395430287520866841080, .7091412742382271468144044321329639889197, + .34646676734620857063262633346312213689100, .7071823204419889502762430939226519337017, + .34922538978528827602332285096053965389730, .7052341597796143250688705234159779614325, + .35197642315717814209818925519357435405250, .7032967032967032967032967032967032967033, + .35471990910292899856770532096561510115850, .7013698630136986301369863013698630136986, + .35745588892180374385176833129662554711100, .6994535519125683060109289617486338797814, + .36018440357500774995358483465679455548530, .6975476839237057220708446866485013623978, + .36290549368936841911903457003063522279280, .6956521739130434782608695652173913043478, + .36561919956096466943762379742111079394830, .6937669376693766937669376693766937669377, + .36832556115870762614150635272380895912650, .6918918918918918918918918918918918918919, + .37102461812787262962487488948681857436900, .6900269541778975741239892183288409703504, + .37371640979358405898480555151763837784530, .6881720430107526881720430107526881720430, + .37640097516425302659470730759494472295050, .6863270777479892761394101876675603217158, + .37907835293496944251145919224654790014030, .6844919786096256684491978609625668449198, + .38174858149084833769393299007788300514230, .6826666666666666666666666666666666666667, + .38441169891033200034513583887019194662580, .6808510638297872340425531914893617021277, + .38706774296844825844488013899535872042180, .6790450928381962864721485411140583554377, + .38971675114002518602873692543653305619950, .6772486772486772486772486772486772486772, + .39235876060286384303665840889152605086580, .6754617414248021108179419525065963060686, + .39499380824086893770896722344332374632350, .6736842105263157894736842105263157894737, + .39762193064713846624158577469643205404280, .6719160104986876640419947506561679790026, + .40024316412701266276741307592601515352730, .6701570680628272251308900523560209424084, + .40285754470108348090917615991202183067800, .6684073107049608355091383812010443864230, + .40546510810816432934799991016916465014230, .6666666666666666666666666666666666666667, + .40806588980822172674223224930756259709600, .6649350649350649350649350649350649350649, + .41065992498526837639616360320360399782650, .6632124352331606217616580310880829015544, + .41324724855021932601317757871584035456180, .6614987080103359173126614987080103359173, + .41582789514371093497757669865677598863850, 
.6597938144329896907216494845360824742268, + .41840189913888381489925905043492093682300, .6580976863753213367609254498714652956298, + .42096929464412963239894338585145305842150, .6564102564102564102564102564102564102564, + .42353011550580327293502591601281892508280, .6547314578005115089514066496163682864450, + .42608439531090003260516141381231136620050, .6530612244897959183673469387755102040816, + .42863216738969872610098832410585600882780, .6513994910941475826972010178117048346056, + .43117346481837132143866142541810404509300, .6497461928934010152284263959390862944162, + .43370832042155937902094819946796633303180, .6481012658227848101265822784810126582278, + .43623676677491801667585491486534010618930, .6464646464646464646464646464646464646465, + .43875883620762790027214350629947148263450, .6448362720403022670025188916876574307305, + .44127456080487520440058801796112675219780, .6432160804020100502512562814070351758794, + .44378397241030093089975139264424797147500, .6416040100250626566416040100250626566416, + .44628710262841947420398014401143882423650, .6400000000000000000000000000000000000000, + .44878398282700665555822183705458883196130, .6384039900249376558603491271820448877805, + .45127464413945855836729492693848442286250, .6368159203980099502487562189054726368159, + .45375911746712049854579618113348260521900, .6352357320099255583126550868486352357320, + .45623743348158757315857769754074979573500, .6336633663366336633663366336633663366337, + .45870962262697662081833982483658473938700, .6320987654320987654320987654320987654321, + .46117571512217014895185229761409573256980, .6305418719211822660098522167487684729064, + .46363574096303250549055974261136725544930, .6289926289926289926289926289926289926290, + .46608972992459918316399125615134835243230, .6274509803921568627450980392156862745098, + .46853771156323925639597405279346276074650, .6259168704156479217603911980440097799511, + .47097971521879100631480241645476780831830, .6243902439024390243902439024390243902439, + .47341577001667212165614273544633761048330, .6228710462287104622871046228710462287105, + .47584590486996386493601107758877333253630, .6213592233009708737864077669902912621359, + .47827014848147025860569669930555392056700, .6198547215496368038740920096852300242131, + .48068852934575190261057286988943815231330, .6183574879227053140096618357487922705314, + .48310107575113581113157579238759353756900, .6168674698795180722891566265060240963855, + .48550781578170076890899053978500887751580, .6153846153846153846153846153846153846154, + .48790877731923892879351001283794175833480, .6139088729016786570743405275779376498801, + .49030398804519381705802061333088204264650, .6124401913875598086124401913875598086124, + .49269347544257524607047571407747454941280, .6109785202863961813842482100238663484487, + .49507726679785146739476431321236304938800, .6095238095238095238095238095238095238095, + .49745538920281889838648226032091770321130, .6080760095011876484560570071258907363420, + .49982786955644931126130359189119189977650, .6066350710900473933649289099526066350711, + .50219473456671548383667413872899487614650, .6052009456264775413711583924349881796690, + .50455601075239520092452494282042607665050, .6037735849056603773584905660377358490566, + .50691172444485432801997148999362252652650, .6023529411764705882352941176470588235294, + .50926190178980790257412536448100581765150, .6009389671361502347417840375586854460094, + .51160656874906207391973111953120678663250, .5995316159250585480093676814988290398126, + 
.51394575110223428282552049495279788970950, .5981308411214953271028037383177570093458, + .51627947444845445623684554448118433356300, .5967365967365967365967365967365967365967, + .51860776420804555186805373523384332656850, .5953488372093023255813953488372093023256, + .52093064562418522900344441950437612831600, .5939675174013921113689095127610208816705, + .52324814376454775732838697877014055848100, .5925925925925925925925925925925925925926, + .52556028352292727401362526507000438869000, .5912240184757505773672055427251732101617, + .52786708962084227803046587723656557500350, .5898617511520737327188940092165898617512, + .53016858660912158374145519701414741575700, .5885057471264367816091954022988505747126, + .53246479886947173376654518506256863474850, .5871559633027522935779816513761467889908, + .53475575061602764748158733709715306758900, .5858123569794050343249427917620137299771, + .53704146589688361856929077475797384977350, .5844748858447488584474885844748858447489, + .53932196859560876944783558428753167390800, .5831435079726651480637813211845102505695, + .54159728243274429804188230264117009937750, .5818181818181818181818181818181818181818, + .54386743096728351609669971367111429572100, .5804988662131519274376417233560090702948, + .54613243759813556721383065450936555862450, .5791855203619909502262443438914027149321, + .54839232556557315767520321969641372561450, .5778781038374717832957110609480812641084, + .55064711795266219063194057525834068655950, .5765765765765765765765765765765765765766, + .55289683768667763352766542084282264113450, .5752808988764044943820224719101123595506, + .55514150754050151093110798683483153581600, .5739910313901345291479820627802690582960, + .55738115013400635344709144192165695130850, .5727069351230425055928411633109619686801, + .55961578793542265941596269840374588966350, .5714285714285714285714285714285714285714, + .56184544326269181269140062795486301183700, .5701559020044543429844097995545657015590, + .56407013828480290218436721261241473257550, .5688888888888888888888888888888888888889, + .56628989502311577464155334382667206227800, .5676274944567627494456762749445676274945, + .56850473535266865532378233183408156037350, .5663716814159292035398230088495575221239, + .57071468100347144680739575051120482385150, .5651214128035320088300220750551876379691, + .57291975356178548306473885531886480748650, .5638766519823788546255506607929515418502, + .57511997447138785144460371157038025558000, .5626373626373626373626373626373626373626, + .57731536503482350219940144597785547375700, .5614035087719298245614035087719298245614, + .57950594641464214795689713355386629700650, .5601750547045951859956236323851203501094, + .58169173963462239562716149521293118596100, .5589519650655021834061135371179039301310, + .58387276558098266665552955601015128195300, .5577342047930283224400871459694989106754, + .58604904500357812846544902640744112432000, .5565217391304347826086956521739130434783, + .58822059851708596855957011939608491957200, .5553145336225596529284164859002169197397, + .59038744660217634674381770309992134571100, .5541125541125541125541125541125541125541, + .59254960960667157898740242671919986605650, .5529157667386609071274298056155507559395, + .59470710774669277576265358220553025603300, .5517241379310344827586206896551724137931, + .59685996110779382384237123915227130055450, .5505376344086021505376344086021505376344, + .59900818964608337768851242799428291618800, .5493562231759656652360515021459227467811, + .60115181318933474940990890900138765573500, 
.5481798715203426124197002141327623126338, + .60329085143808425240052883964381180703650, .5470085470085470085470085470085470085470, + .60542532396671688843525771517306566238400, .5458422174840085287846481876332622601279, + .60755525022454170969155029524699784815300, .5446808510638297872340425531914893617021, + .60968064953685519036241657886421307921400, .5435244161358811040339702760084925690021, + .61180154110599282990534675263916142284850, .5423728813559322033898305084745762711864, + .61391794401237043121710712512140162289150, .5412262156448202959830866807610993657505, + .61602987721551394351138242200249806046500, .5400843881856540084388185654008438818565, + .61813735955507864705538167982012964785100, .5389473684210526315789473684210526315789, + .62024040975185745772080281312810257077200, .5378151260504201680672268907563025210084, + .62233904640877868441606324267922900617100, .5366876310272536687631027253668763102725, + .62443328801189346144440150965237990021700, .5355648535564853556485355648535564853556, + .62652315293135274476554741340805776417250, .5344467640918580375782881002087682672234, + .62860865942237409420556559780379757285100, .5333333333333333333333333333333333333333, + .63068982562619868570408243613201193511500, .5322245322245322245322245322245322245322, + .63276666957103777644277897707070223987100, .5311203319502074688796680497925311203320, + .63483920917301017716738442686619237065300, .5300207039337474120082815734989648033126, + .63690746223706917739093569252872839570050, .5289256198347107438016528925619834710744, + .63897144645792069983514238629140891134750, .5278350515463917525773195876288659793814, + .64103117942093124081992527862894348800200, .5267489711934156378600823045267489711934, + .64308667860302726193566513757104985415950, .5256673511293634496919917864476386036961, + .64513796137358470073053240412264131009600, .5245901639344262295081967213114754098361, + .64718504499530948859131740391603671014300, .5235173824130879345603271983640081799591, + .64922794662510974195157587018911726772800, .5224489795918367346938775510204081632653, + .65126668331495807251485530287027359008800, .5213849287169042769857433808553971486762, + .65330127201274557080523663898929953575150, .5203252032520325203252032520325203252033, + .65533172956312757406749369692988693714150, .5192697768762677484787018255578093306288, + .65735807270835999727154330685152672231200, .5182186234817813765182186234817813765182, + .65938031808912778153342060249997302889800, .5171717171717171717171717171717171717172, + .66139848224536490484126716182800009846700, .5161290322580645161290322580645161290323, + .66341258161706617713093692145776003599150, .5150905432595573440643863179074446680080, + .66542263254509037562201001492212526500250, .5140562248995983935742971887550200803213, + .66742865127195616370414654738851822912700, .5130260521042084168336673346693386773547, + .66943065394262923906154583164607174694550, .5120000000000000000000000000000000000000, + .67142865660530226534774556057527661323550, .5109780439121756487025948103792415169661, + .67342267521216669923234121597488410770900, .5099601593625498007968127490039840637450, + .67541272562017662384192817626171745359900, .5089463220675944333996023856858846918489, + .67739882359180603188519853574689477682100, .5079365079365079365079365079365079365079, + .67938098479579733801614338517538271844400, .5069306930693069306930693069306930693069, + .68135922480790300781450241629499942064300, .5059288537549407114624505928853754940711, + 
.68333355911162063645036823800182901322850, .5049309664694280078895463510848126232742, + .68530400309891936760919861626462079584600, .5039370078740157480314960629921259842520, + .68727057207096020619019327568821609020250, .5029469548133595284872298624754420432220, + .68923328123880889251040571252815425395950, .5019607843137254901960784313725490196078, + .69314718055994530941723212145818, 5.0e-01, +}; + + + +#define LOGTAB_TRANSLATE(x,h) (((x) - 1.)*icvLogTab[(h)+1]) +static const double ln_2 = 0.69314718055994530941723212145818; + +void log( const float *_x, float *y, int n ) +{ + static const float shift[] = { 0, -1.f/512 }; + static const float + A0 = 0.3333333333333333333333333f, + A1 = -0.5f, + A2 = 1.f; + +#undef LOGPOLY +#define LOGPOLY(x) (((A0*(x) + A1)*(x) + A2)*(x)) + + int i = 0; + Cv32suf buf[4]; + const int* x = (const int*)_x; + +#if CV_SSE2 + static const __m128d ln2_2 = _mm_set1_pd(ln_2); + static const __m128 _1_4 = _mm_set1_ps(1.f); + static const __m128 shift4 = _mm_set1_ps(-1.f/512); + + static const __m128 mA0 = _mm_set1_ps(A0); + static const __m128 mA1 = _mm_set1_ps(A1); + static const __m128 mA2 = _mm_set1_ps(A2); + + int CV_DECL_ALIGNED(16) idx[4]; + + for( ; i <= n - 4; i += 4 ) + { + __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); + __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 23), _mm_set1_epi32(255)), _mm_set1_epi32(127)); + __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); + __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0,yi0)), ln2_2); + + __m128i xi0 = _mm_or_si128(_mm_and_si128(h0, _mm_set1_epi32(LOGTAB_MASK2_32F)), _mm_set1_epi32(127 << 23)); + + h0 = _mm_and_si128(_mm_srli_epi32(h0, 23 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK*2)); + _mm_store_si128((__m128i*)idx, h0); + h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); + + __m128d t0, t1, t2, t3, t4; + t0 = _mm_load_pd(icvLogTab + idx[0]); + t2 = _mm_load_pd(icvLogTab + idx[1]); + t1 = _mm_unpackhi_pd(t0, t2); + t0 = _mm_unpacklo_pd(t0, t2); + t2 = _mm_load_pd(icvLogTab + idx[2]); + t4 = _mm_load_pd(icvLogTab + idx[3]); + t3 = _mm_unpackhi_pd(t2, t4); + t2 = _mm_unpacklo_pd(t2, t4); + + yd0 = _mm_add_pd(yd0, t0); + yd1 = _mm_add_pd(yd1, t2); + + __m128 yf0 = _mm_movelh_ps(_mm_cvtpd_ps(yd0), _mm_cvtpd_ps(yd1)); + + __m128 xf0 = _mm_sub_ps(_mm_castsi128_ps(xi0), _1_4); + xf0 = _mm_mul_ps(xf0, _mm_movelh_ps(_mm_cvtpd_ps(t1), _mm_cvtpd_ps(t3))); + xf0 = _mm_add_ps(xf0, _mm_and_ps(_mm_castsi128_ps(h0), shift4)); + + __m128 zf0 = _mm_mul_ps(xf0, mA0); + zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA1), xf0); + zf0 = _mm_mul_ps(_mm_add_ps(zf0, mA2), xf0); + yf0 = _mm_add_ps(yf0, zf0); + + _mm_storeu_ps(y + i, yf0); + } +#endif + for( ; i <= n - 4; i += 4 ) + { + double x0, x1, x2, x3; + double y0, y1, y2, y3; + int h0, h1, h2, h3; + + h0 = x[i]; + h1 = x[i+1]; + buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); + buf[1].i = (h1 & LOGTAB_MASK2_32F) | (127 << 23); + + y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; + y1 = (((h1 >> 23) & 0xff) - 127) * ln_2; + + h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + h1 = (h1 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + + y0 += icvLogTab[h0]; + y1 += icvLogTab[h1]; + + h2 = x[i+2]; + h3 = x[i+3]; + + x0 = LOGTAB_TRANSLATE( buf[0].f, h0 ); + x1 = LOGTAB_TRANSLATE( buf[1].f, h1 ); + + buf[2].i = (h2 & LOGTAB_MASK2_32F) | (127 << 23); + buf[3].i = (h3 & LOGTAB_MASK2_32F) | (127 << 23); + + y2 = (((h2 >> 23) & 0xff) - 127) * ln_2; + y3 = (((h3 >> 23) & 0xff) - 127) * ln_2; + + h2 = (h2 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + 
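+        /* per element: log(x) = e*ln_2 + icvLogTab[h] + LOGPOLY(r), where e
+           is the unbiased exponent, h = 2k indexes the table pair for the
+           top 8 mantissa bits k, and r is the scaled mantissa residual */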
h3 = (h3 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + + y2 += icvLogTab[h2]; + y3 += icvLogTab[h3]; + + x2 = LOGTAB_TRANSLATE( buf[2].f, h2 ); + x3 = LOGTAB_TRANSLATE( buf[3].f, h3 ); + + x0 += shift[h0 == 510]; + x1 += shift[h1 == 510]; + y0 += LOGPOLY( x0 ); + y1 += LOGPOLY( x1 ); + + y[i] = (float) y0; + y[i + 1] = (float) y1; + + x2 += shift[h2 == 510]; + x3 += shift[h3 == 510]; + y2 += LOGPOLY( x2 ); + y3 += LOGPOLY( x3 ); + + y[i + 2] = (float) y2; + y[i + 3] = (float) y3; + } + + for( ; i < n; i++ ) + { + int h0 = x[i]; + double y0; + float x0; + + y0 = (((h0 >> 23) & 0xff) - 127) * ln_2; + + buf[0].i = (h0 & LOGTAB_MASK2_32F) | (127 << 23); + h0 = (h0 >> (23 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + + y0 += icvLogTab[h0]; + x0 = (float)LOGTAB_TRANSLATE( buf[0].f, h0 ); + x0 += shift[h0 == 510]; + y0 += LOGPOLY( x0 ); + + y[i] = (float)y0; + } +} + +void log( const double *x, double *y, int n ) +{ + static const double shift[] = { 0, -1./512 }; + static const double + A7 = 1.0, + A6 = -0.5, + A5 = 0.333333333333333314829616256247390992939472198486328125, + A4 = -0.25, + A3 = 0.2, + A2 = -0.1666666666666666574148081281236954964697360992431640625, + A1 = 0.1428571428571428769682682968777953647077083587646484375, + A0 = -0.125; + +#undef LOGPOLY +#define LOGPOLY(x,k) ((x)+=shift[k], xq = (x)*(x),\ +(((A0*xq + A2)*xq + A4)*xq + A6)*xq + \ +(((A1*xq + A3)*xq + A5)*xq + A7)*(x)) + + int i = 0; + DBLINT buf[4]; + DBLINT *X = (DBLINT *) x; + +#if CV_SSE2 + static const __m128d ln2_2 = _mm_set1_pd(ln_2); + static const __m128d _1_2 = _mm_set1_pd(1.); + static const __m128d shift2 = _mm_set1_pd(-1./512); + + static const __m128i log_and_mask2 = _mm_set_epi32(LOGTAB_MASK2, 0xffffffff, LOGTAB_MASK2, 0xffffffff); + static const __m128i log_or_mask2 = _mm_set_epi32(1023 << 20, 0, 1023 << 20, 0); + + static const __m128d mA0 = _mm_set1_pd(A0); + static const __m128d mA1 = _mm_set1_pd(A1); + static const __m128d mA2 = _mm_set1_pd(A2); + static const __m128d mA3 = _mm_set1_pd(A3); + static const __m128d mA4 = _mm_set1_pd(A4); + static const __m128d mA5 = _mm_set1_pd(A5); + static const __m128d mA6 = _mm_set1_pd(A6); + static const __m128d mA7 = _mm_set1_pd(A7); + + int CV_DECL_ALIGNED(16) idx[4]; + + for( ; i <= n - 4; i += 4 ) + { + __m128i h0 = _mm_loadu_si128((const __m128i*)(x + i)); + __m128i h1 = _mm_loadu_si128((const __m128i*)(x + i + 2)); + + __m128d xd0 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h0, log_and_mask2), log_or_mask2)); + __m128d xd1 = _mm_castsi128_pd(_mm_or_si128(_mm_and_si128(h1, log_and_mask2), log_or_mask2)); + + h0 = _mm_unpackhi_epi32(_mm_unpacklo_epi32(h0, h1), _mm_unpackhi_epi32(h0, h1)); + + __m128i yi0 = _mm_sub_epi32(_mm_and_si128(_mm_srli_epi32(h0, 20), + _mm_set1_epi32(2047)), _mm_set1_epi32(1023)); + __m128d yd0 = _mm_mul_pd(_mm_cvtepi32_pd(yi0), ln2_2); + __m128d yd1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_unpackhi_epi64(yi0, yi0)), ln2_2); + + h0 = _mm_and_si128(_mm_srli_epi32(h0, 20 - LOGTAB_SCALE - 1), _mm_set1_epi32(LOGTAB_MASK * 2)); + _mm_store_si128((__m128i*)idx, h0); + h0 = _mm_cmpeq_epi32(h0, _mm_set1_epi32(510)); + + __m128d t0, t1, t2, t3, t4; + t0 = _mm_load_pd(icvLogTab + idx[0]); + t2 = _mm_load_pd(icvLogTab + idx[1]); + t1 = _mm_unpackhi_pd(t0, t2); + t0 = _mm_unpacklo_pd(t0, t2); + t2 = _mm_load_pd(icvLogTab + idx[2]); + t4 = _mm_load_pd(icvLogTab + idx[3]); + t3 = _mm_unpackhi_pd(t2, t4); + t2 = _mm_unpacklo_pd(t2, t4); + + yd0 = _mm_add_pd(yd0, t0); + yd1 = _mm_add_pd(yd1, t2); + + xd0 = _mm_mul_pd(_mm_sub_pd(xd0, _1_2), t1); + xd1 = 
_mm_mul_pd(_mm_sub_pd(xd1, _1_2), t3); + + xd0 = _mm_add_pd(xd0, _mm_and_pd(_mm_castsi128_pd(_mm_unpacklo_epi32(h0, h0)), shift2)); + xd1 = _mm_add_pd(xd1, _mm_and_pd(_mm_castsi128_pd(_mm_unpackhi_epi32(h0, h0)), shift2)); + + __m128d zd0 = _mm_mul_pd(xd0, mA0); + __m128d zd1 = _mm_mul_pd(xd1, mA0); + zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA1), xd0); + zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA1), xd1); + zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA2), xd0); + zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA2), xd1); + zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA3), xd0); + zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA3), xd1); + zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA4), xd0); + zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA4), xd1); + zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA5), xd0); + zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA5), xd1); + zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA6), xd0); + zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA6), xd1); + zd0 = _mm_mul_pd(_mm_add_pd(zd0, mA7), xd0); + zd1 = _mm_mul_pd(_mm_add_pd(zd1, mA7), xd1); + + yd0 = _mm_add_pd(yd0, zd0); + yd1 = _mm_add_pd(yd1, zd1); + + _mm_storeu_pd(y + i, yd0); + _mm_storeu_pd(y + i + 2, yd1); + } +#endif + for( ; i <= n - 4; i += 4 ) + { + double xq; + double x0, x1, x2, x3; + double y0, y1, y2, y3; + int h0, h1, h2, h3; + + h0 = X[i].i.lo; + h1 = X[i + 1].i.lo; + buf[0].i.lo = h0; + buf[1].i.lo = h1; + + h0 = X[i].i.hi; + h1 = X[i + 1].i.hi; + buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); + buf[1].i.hi = (h1 & LOGTAB_MASK2) | (1023 << 20); + + y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; + y1 = (((h1 >> 20) & 0x7ff) - 1023) * ln_2; + + h2 = X[i + 2].i.lo; + h3 = X[i + 3].i.lo; + buf[2].i.lo = h2; + buf[3].i.lo = h3; + + h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + h1 = (h1 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + + y0 += icvLogTab[h0]; + y1 += icvLogTab[h1]; + + h2 = X[i + 2].i.hi; + h3 = X[i + 3].i.hi; + + x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); + x1 = LOGTAB_TRANSLATE( buf[1].d, h1 ); + + buf[2].i.hi = (h2 & LOGTAB_MASK2) | (1023 << 20); + buf[3].i.hi = (h3 & LOGTAB_MASK2) | (1023 << 20); + + y2 = (((h2 >> 20) & 0x7ff) - 1023) * ln_2; + y3 = (((h3 >> 20) & 0x7ff) - 1023) * ln_2; + + h2 = (h2 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + h3 = (h3 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + + y2 += icvLogTab[h2]; + y3 += icvLogTab[h3]; + + x2 = LOGTAB_TRANSLATE( buf[2].d, h2 ); + x3 = LOGTAB_TRANSLATE( buf[3].d, h3 ); + + y0 += LOGPOLY( x0, h0 == 510 ); + y1 += LOGPOLY( x1, h1 == 510 ); + + y[i] = y0; + y[i + 1] = y1; + + y2 += LOGPOLY( x2, h2 == 510 ); + y3 += LOGPOLY( x3, h3 == 510 ); + + y[i + 2] = y2; + y[i + 3] = y3; + } + + for( ; i < n; i++ ) + { + int h0 = X[i].i.hi; + double xq; + double x0, y0 = (((h0 >> 20) & 0x7ff) - 1023) * ln_2; + + buf[0].i.hi = (h0 & LOGTAB_MASK2) | (1023 << 20); + buf[0].i.lo = X[i].i.lo; + h0 = (h0 >> (20 - LOGTAB_SCALE - 1)) & LOGTAB_MASK * 2; + + y0 += icvLogTab[h0]; + x0 = LOGTAB_TRANSLATE( buf[0].d, h0 ); + y0 += LOGPOLY( x0, h0 == 510 ); + y[i] = y0; + } +} + }} diff --git a/modules/hal/src/matrix.cpp b/modules/hal/src/matrix.cpp index a3f69facc..9506aaf47 100644 --- a/modules/hal/src/matrix.cpp +++ b/modules/hal/src/matrix.cpp @@ -44,4 +44,165 @@ namespace cv { namespace hal { +/****************************************************************************************\ +* LU & Cholesky implementation for small matrices * +\****************************************************************************************/ + +template static inline int +LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n) +{ + int i, j, k, p = 1; + astep /= 
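+    /* strides arrive in bytes and are converted to element counts here;
+       LUImpl runs Gaussian elimination with partial pivoting, solving
+       A*x = b in place when b is given, and returns the sign of the row
+       permutation (+1/-1) or 0 if a pivot drops below epsilon */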
sizeof(A[0]); + bstep /= sizeof(b[0]); + + for( i = 0; i < m; i++ ) + { + k = i; + + for( j = i+1; j < m; j++ ) + if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) ) + k = j; + + if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() ) + return 0; + + if( k != i ) + { + for( j = i; j < m; j++ ) + std::swap(A[i*astep + j], A[k*astep + j]); + if( b ) + for( j = 0; j < n; j++ ) + std::swap(b[i*bstep + j], b[k*bstep + j]); + p = -p; + } + + _Tp d = -1/A[i*astep + i]; + + for( j = i+1; j < m; j++ ) + { + _Tp alpha = A[j*astep + i]*d; + + for( k = i+1; k < m; k++ ) + A[j*astep + k] += alpha*A[i*astep + k]; + + if( b ) + for( k = 0; k < n; k++ ) + b[j*bstep + k] += alpha*b[i*bstep + k]; + } + + A[i*astep + i] = -d; + } + + if( b ) + { + for( i = m-1; i >= 0; i-- ) + for( j = 0; j < n; j++ ) + { + _Tp s = b[i*bstep + j]; + for( k = i+1; k < m; k++ ) + s -= A[i*astep + k]*b[k*bstep + j]; + b[i*bstep + j] = s*A[i*astep + i]; + } + } + + return p; +} + + +int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n) +{ + return LUImpl(A, astep, m, b, bstep, n); +} + + +int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n) +{ + return LUImpl(A, astep, m, b, bstep, n); +} + + +template<typename _Tp> static inline bool +CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n) +{ + _Tp* L = A; + int i, j, k; + double s; + astep /= sizeof(A[0]); + bstep /= sizeof(b[0]); + + for( i = 0; i < m; i++ ) + { + for( j = 0; j < i; j++ ) + { + s = A[i*astep + j]; + for( k = 0; k < j; k++ ) + s -= L[i*astep + k]*L[j*astep + k]; + L[i*astep + j] = (_Tp)(s*L[j*astep + j]); + } + s = A[i*astep + i]; + for( k = 0; k < j; k++ ) + { + double t = L[i*astep + k]; + s -= t*t; + } + if( s < std::numeric_limits<_Tp>::epsilon() ) + return false; + L[i*astep + i] = (_Tp)(1./std::sqrt(s)); + } + + if( !b ) + return true; + + // LLt x = b + // 1: L y = b + // 2.
Lt x = y + + /* + [ L00 ] y0 b0 + [ L10 L11 ] y1 = b1 + [ L20 L21 L22 ] y2 b2 + [ L30 L31 L32 L33 ] y3 b3 + + [ L00 L10 L20 L30 ] x0 y0 + [ L11 L21 L31 ] x1 = y1 + [ L22 L32 ] x2 y2 + [ L33 ] x3 y3 + */ + + for( i = 0; i < m; i++ ) + { + for( j = 0; j < n; j++ ) + { + s = b[i*bstep + j]; + for( k = 0; k < i; k++ ) + s -= L[i*astep + k]*b[k*bstep + j]; + b[i*bstep + j] = (_Tp)(s*L[i*astep + i]); + } + } + + for( i = m-1; i >= 0; i-- ) + { + for( j = 0; j < n; j++ ) + { + s = b[i*bstep + j]; + for( k = m-1; k > i; k-- ) + s -= L[k*astep + i]*b[k*bstep + j]; + b[i*bstep + j] = (_Tp)(s*L[i*astep + i]); + } + } + + return true; +} + + +bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n) +{ + return CholImpl(A, astep, m, b, bstep, n); +} + +bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n) +{ + return CholImpl(A, astep, m, b, bstep, n); +} + }} diff --git a/modules/hal/src/precomp.hpp b/modules/hal/src/precomp.hpp index e6923fb89..95ddac9bc 100644 --- a/modules/hal/src/precomp.hpp +++ b/modules/hal/src/precomp.hpp @@ -42,3 +42,7 @@ #include "opencv2/hal.hpp" #include "opencv2/hal/intrin.hpp" +#include +#include +#include +#include diff --git a/modules/hal/src/stat.cpp b/modules/hal/src/stat.cpp index bdcf9ed72..ec3b8db5a 100644 --- a/modules/hal/src/stat.cpp +++ b/modules/hal/src/stat.cpp @@ -80,10 +80,10 @@ static const uchar popCountTable4[] = 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }; -Error::Code normHamming(const uchar* a, int n, int & result) +int normHamming(const uchar* a, int n) { int i = 0; - result = 0; + int result = 0; #if CV_NEON { uint32x4_t bits = vmovq_n_u32(0); @@ -104,13 +104,13 @@ Error::Code normHamming(const uchar* a, int n, int & result) popCountTable[a[i+2]] + popCountTable[a[i+3]]; for( ; i < n; i++ ) result += popCountTable[a[i]]; - return Error::Ok; + return result; } -Error::Code normHamming(const uchar* a, const uchar* b, int n, int & result) +int normHamming(const uchar* a, const uchar* b, int n) { int i = 0; - result = 0; + int result = 0; #if CV_NEON { uint32x4_t bits = vmovq_n_u32(0); @@ -133,44 +133,44 @@ Error::Code normHamming(const uchar* a, const uchar* b, int n, int & result) popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]]; for( ; i < n; i++ ) result += popCountTable[a[i] ^ b[i]]; - return Error::Ok; + return result; } -Error::Code normHamming(const uchar* a, int n, int cellSize, int & result) +int normHamming(const uchar* a, int n, int cellSize) { if( cellSize == 1 ) - return normHamming(a, n, result); + return normHamming(a, n); const uchar* tab = 0; if( cellSize == 2 ) tab = popCountTable2; else if( cellSize == 4 ) tab = popCountTable4; else - return Error::Unknown; + return -1; int i = 0; - result = 0; + int result = 0; #if CV_ENABLE_UNROLLED for( ; i <= n - 4; i += 4 ) result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]]; #endif for( ; i < n; i++ ) result += tab[a[i]]; - return Error::Ok; + return result; } -Error::Code normHamming(const uchar* a, const uchar* b, int n, int cellSize, int & result) +int normHamming(const uchar* a, const uchar* b, int n, int cellSize) { if( cellSize == 1 ) - return normHamming(a, b, n, result); + return normHamming(a, b, n); const uchar* tab = 0; if( cellSize == 2 ) tab = popCountTable2; else if( cellSize == 4 ) tab = popCountTable4; else - return Error::Unknown; + return -1; int i = 0; - result = 0; + int result = 0; #if CV_ENABLE_UNROLLED for( ; i <= n - 4; i += 4 ) result += tab[a[i] ^ 
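/*
   These normHamming kernels count differing bits with population-count lookup tables
   (plus a NEON path); the cellSize == 2 / == 4 variants use popCountTable2/4, which
   score each 2- or 4-bit cell once if it is nonzero, matching descriptors that pack
   multi-bit cells. The signature change in this patch replaces the old
   Error::Code-plus-output-parameter style with a plain int return, -1 now flagging an
   unsupported cellSize. A hedged usage sketch for two 256-bit binary descriptors
   (illustrative buffers):

       unsigned char d1[32], d2[32];                  // e.g. two ORB-sized descriptors
       // ... fill d1 and d2 from a feature extractor ...
       int dist = cv::hal::normHamming(d1, d2, 32);   // count of differing bits
*/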
b[i]] + tab[a[i+1] ^ b[i+1]] + @@ -178,7 +178,129 @@ Error::Code normHamming(const uchar* a, const uchar* b, int n, int cellSize, int #endif for( ; i < n; i++ ) result += tab[a[i] ^ b[i]]; - return Error::Ok; + return result; +} + +float normL2Sqr_(const float* a, const float* b, int n) +{ + int j = 0; float d = 0.f; +#if CV_SSE + float CV_DECL_ALIGNED(16) buf[4]; + __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); + + for( ; j <= n - 8; j += 8 ) + { + __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j)); + __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4)); + d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0)); + d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1)); + } + _mm_store_ps(buf, _mm_add_ps(d0, d1)); + d = buf[0] + buf[1] + buf[2] + buf[3]; +#endif + { + for( ; j <= n - 4; j += 4 ) + { + float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3]; + d += t0*t0 + t1*t1 + t2*t2 + t3*t3; + } + } + + for( ; j < n; j++ ) + { + float t = a[j] - b[j]; + d += t*t; + } + return d; +} + + +float normL1_(const float* a, const float* b, int n) +{ + int j = 0; float d = 0.f; +#if CV_SSE + float CV_DECL_ALIGNED(16) buf[4]; + static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; + __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); + __m128 absmask = _mm_load_ps((const float*)absbuf); + + for( ; j <= n - 8; j += 8 ) + { + __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j)); + __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4)); + d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask)); + d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask)); + } + _mm_store_ps(buf, _mm_add_ps(d0, d1)); + d = buf[0] + buf[1] + buf[2] + buf[3]; +#elif CV_NEON + float32x4_t v_sum = vdupq_n_f32(0.0f); + for ( ; j <= n - 4; j += 4) + v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j))); + + float CV_DECL_ALIGNED(16) buf[4]; + vst1q_f32(buf, v_sum); + d = buf[0] + buf[1] + buf[2] + buf[3]; +#endif + { + for( ; j <= n - 4; j += 4 ) + { + d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) + + std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]); + } + } + + for( ; j < n; j++ ) + d += std::abs(a[j] - b[j]); + return d; +} + +int normL1_(const uchar* a, const uchar* b, int n) +{ + int j = 0, d = 0; +#if CV_SSE + __m128i d0 = _mm_setzero_si128(); + + for( ; j <= n - 16; j += 16 ) + { + __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j)); + __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j)); + + d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); + } + + for( ; j <= n - 4; j += 4 ) + { + __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j)); + __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j)); + + d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1)); + } + d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0))); +#elif CV_NEON + uint32x4_t v_sum = vdupq_n_u32(0.0f); + for ( ; j <= n - 16; j += 16) + { + uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j)); + uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst)); + v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high))); + v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high))); + } + + uint CV_DECL_ALIGNED(16) buf[4]; + vst1q_u32(buf, v_sum); + d = buf[0] + buf[1] + buf[2] + buf[3]; +#endif + { + for( ; j <= n - 4; j += 4 ) + { + d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) + + std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]); + } + } + 
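/*
   Note on the SSE branch of the uchar normL1_ above: _mm_sad_epu8 computes the sum of
   absolute differences of 16 byte pairs in one instruction, producing one 16-bit sum
   per 8-byte half (zero-extended into the two 64-bit lanes), so a whole 16-byte chunk
   of the L1 distance costs a single op plus the final horizontal add; the NEON branch
   reaches the same result with vabdq_u8 followed by pairwise widening adds. A plain-C
   equivalence sketch of what one 16-byte iteration accumulates (illustrative):

       int sad16(const unsigned char* a, const unsigned char* b)
       {
           int s = 0;
           for (int k = 0; k < 16; k++)                   // per-byte absolute difference
               s += a[k] > b[k] ? a[k] - b[k] : b[k] - a[k];
           return s;  // the value _mm_sad_epu8 folds into its two lanes, combined
       }
*/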
for( ; j < n; j++ ) + d += std::abs(a[j] - b[j]); + return d; } }} //cv::hal diff --git a/modules/photo/src/arrays.hpp b/modules/photo/src/arrays.hpp index 4aec5f7a1..cdd59a328 100644 --- a/modules/photo/src/arrays.hpp +++ b/modules/photo/src/arrays.hpp @@ -44,6 +44,9 @@ #ifndef __OPENCV_DENOISING_ARRAYS_HPP__ #define __OPENCV_DENOISING_ARRAYS_HPP__ +namespace cv +{ + template struct Array2d { @@ -176,4 +179,6 @@ struct Array4d } }; +} + #endif diff --git a/modules/stitching/src/autocalib.cpp b/modules/stitching/src/autocalib.cpp index 56a9df57b..91244bde1 100644 --- a/modules/stitching/src/autocalib.cpp +++ b/modules/stitching/src/autocalib.cpp @@ -49,7 +49,7 @@ namespace { template static inline bool decomposeCholesky(_Tp* A, size_t astep, int m) { - if (!Cholesky(A, astep, m, 0, 0, 0)) + if (!hal::Cholesky(A, astep, m, 0, 0, 0)) return false; astep /= sizeof(A[0]); for (int i = 0; i < m; ++i) From c7121e877202c44d4def8f9c1d5113c6ee646b39 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 16 Apr 2015 23:42:00 +0300 Subject: [PATCH 21/48] fixed compile warnings --- modules/hal/include/opencv2/hal/defs.h | 2 ++ modules/hal/include/opencv2/hal/intrin.hpp | 27 ++++++++++++++-------- modules/hal/src/mathfuncs.cpp | 4 ++-- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/modules/hal/include/opencv2/hal/defs.h b/modules/hal/include/opencv2/hal/defs.h index c011fe617..18c1adc6c 100644 --- a/modules/hal/include/opencv2/hal/defs.h +++ b/modules/hal/include/opencv2/hal/defs.h @@ -666,6 +666,8 @@ template<> inline unsigned saturate_cast(double v) { return cvRound(v) //! @endcond +//! @} core_utils + } #endif // __cplusplus diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp index c3c47e059..bfd41ad0d 100644 --- a/modules/hal/include/opencv2/hal/intrin.hpp +++ b/modules/hal/include/opencv2/hal/intrin.hpp @@ -350,7 +350,6 @@ OPENCV_HAL_IMPL_BIT_OP(^) template inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) { v_reg<_Tp, n> c; - typedef typename TypeTraits<_Tp>::int_type itype; for( int i = 0; i < n; i++ ) c.s[i] = TypeTraits<_Tp>::reinterpret_from_int(~TypeTraits<_Tp>::reinterpret_int(a.s[i])); return c; @@ -1426,13 +1425,13 @@ OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd) inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b) { - __m128i delta = _mm_set1_epi8((char)0x80); + __m128i delta = _mm_set1_epi8((char)-128); return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)))); } inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b) { - __m128i delta = _mm_set1_epi8((char)0x80); + __m128i delta = _mm_set1_epi8((char)-128); return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)))); } @@ -1523,8 +1522,8 @@ inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \ return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \ } -OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)0x80) -OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)0x8000) +OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128) +OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768) OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000) #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \ @@ -1553,21 +1552,21 @@ OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8) 
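/*
   The (char)0x80 -> (char)-128 and (short)0x8000 -> (short)-32768 edits above are
   warning fixes only; each pair names the same bit pattern. That pattern matters
   because SSE2 provides unsigned byte min/max but no signed ones: XOR-ing both
   operands with the sign bit maps the signed range monotonically onto the unsigned
   range, so an unsigned min/max on the biased values, un-biased afterwards, equals
   the signed result. The absdiff macros just below build the same per-lane bias with
   a single _mm_set1_epi32(0x80808080 or 0x80008000). A scalar sketch of the identity
   being exploited (illustrative):

       signed char smin8(signed char a, signed char b)
       {
           unsigned char ua = (unsigned char)(a ^ 0x80);   // -128..127 -> 0..255
           unsigned char ub = (unsigned char)(b ^ 0x80);   // order-preserving bias
           unsigned char um = ua < ub ? ua : ub;           // unsigned min
           return (signed char)(um ^ 0x80);                // remove the bias
       }
*/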
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16) OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16) -#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, sbit) \ +#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \ inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \ { \ return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \ } \ inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \ { \ - __m128i smask = _mm_set1_epi8(sbit); \ + __m128i smask = _mm_set1_epi32(smask32); \ __m128i a1 = _mm_xor_si128(a.val, smask); \ __m128i b1 = _mm_xor_si128(b.val, smask); \ return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \ } -OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (char)0x80) -OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (char)0x8000) +OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080) +OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000) #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ @@ -1704,7 +1703,7 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max) OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min) #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \ -inline bool v_signmask(const _Tpvec& a) \ +inline int v_signmask(const _Tpvec& a) \ { \ return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \ } \ @@ -2850,4 +2849,12 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, }} +#ifndef CV_SIMD128 +#define CV_SIMD128 0 +#endif + +#ifndef CV_SIMD128_64F +#define CV_SIMD128_64F 0 +#endif + #endif diff --git a/modules/hal/src/mathfuncs.cpp b/modules/hal/src/mathfuncs.cpp index e970cfedb..7d0199f00 100644 --- a/modules/hal/src/mathfuncs.cpp +++ b/modules/hal/src/mathfuncs.cpp @@ -185,7 +185,7 @@ void magnitude(const double* x, const double* y, double* mag, int len) { int i = 0; -#if defined CV_SIMD128_64F && CV_SIMD128_64F +#if CV_SIMD128_64F for( ; i <= len - 4; i += 4 ) { v_float64x2 x0 = v_load(x + i), x1 = v_load(x + i + 2); @@ -262,7 +262,7 @@ void sqrt(const double* src, double* dst, int len) { int i = 0; -#if defined CV_SIMD128_64F && CV_SIMD128_64F +#if CV_SIMD128_64F for( ; i <= len - 4; i += 4 ) { v_float64x2 t0 = v_load(src + i), t1 = v_load(src + i + 2); From 1fcc83b84b7ff2a6c945d3a26cc487aae0352b70 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 16 Apr 2015 23:52:23 +0300 Subject: [PATCH 22/48] fixed some more compile warnings --- modules/hal/include/opencv2/hal/intrin.hpp | 18 +++++++++--------- modules/hal/src/precomp.hpp | 1 + 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp index bfd41ad0d..adb95d6a1 100644 --- a/modules/hal/include/opencv2/hal/intrin.hpp +++ b/modules/hal/include/opencv2/hal/intrin.hpp @@ -1140,7 +1140,7 @@ inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) template inline v_float64x2 v_reinterpret_as_f64(const _Tpvec& a) { return v_float64x2(_mm_castsi128_pd(a.val)); } -inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) +inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) { return v_float64x2(_mm_castps_pd(a.val)); } inline v_uint8x16 v_cvtn_u16(const v_uint16x8& a, const 
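/*
   In the v_load_deinterleave hunks below, the intermediates change from __m128 to
   __m128i because the epi8/epi16 unpack intrinsics return integer vectors; storing
   them in float vectors only compiled through lax, compiler-specific vector
   conversions, which is what this "fixed some more compile warnings" patch cleans up.
   The transpose trick itself is unchanged: interleaving unpacks applied twice turn
   four packed AoS registers into per-channel SoA registers. A hedged sketch of one
   correctly typed step (illustrative):

       __m128i u0 = _mm_loadu_si128((const __m128i*)(ptr));       // a0 b0 c0 d0 ...
       __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32));  // a8 b8 c8 d8 ...
       __m128i v0 = _mm_unpacklo_epi8(u0, u2);                    // a0 a8 b0 b8 ...
*/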
v_uint16x8& b) @@ -1893,10 +1893,10 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ... __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ... - __m128 v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ... - __m128 v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ... - __m128 v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ... - __m128 v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b4 b14 ... + __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ... + __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ... + __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ... + __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b4 b14 ... u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ... u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ... @@ -1928,10 +1928,10 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ... __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ... - __m128 v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ... - __m128 v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ... - __m128 v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ... - __m128 v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ... + __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ... + __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ... + __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ... + __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ... u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ... u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ... diff --git a/modules/hal/src/precomp.hpp b/modules/hal/src/precomp.hpp index 95ddac9bc..630565bec 100644 --- a/modules/hal/src/precomp.hpp +++ b/modules/hal/src/precomp.hpp @@ -45,4 +45,5 @@ #include #include #include +#include #include From bfbf864a73210112d49867632d60d23dc30a143f Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Fri, 17 Apr 2015 00:13:08 +0300 Subject: [PATCH 23/48] fixed some more compile warnings (2) --- modules/hal/include/opencv2/hal/defs.h | 4 +- modules/hal/include/opencv2/hal/intrin.hpp | 52 +++------------------- 2 files changed, 8 insertions(+), 48 deletions(-) diff --git a/modules/hal/include/opencv2/hal/defs.h b/modules/hal/include/opencv2/hal/defs.h index 18c1adc6c..197533993 100644 --- a/modules/hal/include/opencv2/hal/defs.h +++ b/modules/hal/include/opencv2/hal/defs.h @@ -666,10 +666,10 @@ template<> inline unsigned saturate_cast(double v) { return cvRound(v) //! @endcond -//! @} core_utils - } #endif // __cplusplus +//! 
@} core_utils + #endif //__OPENCV_HAL_H__ diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp index adb95d6a1..3c53d6079 100644 --- a/modules/hal/include/opencv2/hal/intrin.hpp +++ b/modules/hal/include/opencv2/hal/intrin.hpp @@ -905,10 +905,6 @@ struct v_uint8x16 (char)v8, (char)v9, (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15); } - uchar get(const int i) const - { - return (uchar)(_mm_extract_epi16(val, i/2) >> ((i&1)*8)); - } uchar get0() const { return (uchar)_mm_cvtsi128_si32(val); @@ -928,10 +924,6 @@ struct v_int8x16 (char)v8, (char)v9, (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15); } - schar get(const int i) const - { - return (schar)(_mm_extract_epi16(val, i/2) >> ((i&1)*8)); - } schar get0() const { return (schar)_mm_cvtsi128_si32(val); @@ -948,10 +940,6 @@ struct v_uint16x8 val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, (short)v4, (short)v5, (short)v6, (short)v7); } - ushort get(const int i) const - { - return (ushort)_mm_extract_epi16(val, i); - } ushort get0() const { return (ushort)_mm_cvtsi128_si32(val); @@ -968,10 +956,6 @@ struct v_int16x8 val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3, (short)v4, (short)v5, (short)v6, (short)v7); } - short get(const int i) const - { - return (short)_mm_extract_epi16(val, i); - } short get0() const { return (short)_mm_cvtsi128_si32(val); @@ -986,12 +970,6 @@ struct v_uint32x4 { val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3); } - unsigned get(const int i) const - { - unsigned CV_DECL_ALIGNED(16) buf[4]; - _mm_store_si128((__m128i*)buf, val); - return buf[i]; - } unsigned get0() const { return (unsigned)_mm_cvtsi128_si32(val); @@ -1006,12 +984,6 @@ struct v_int32x4 { val = _mm_setr_epi32(v0, v1, v2, v3); } - int get(int i) const - { - int CV_DECL_ALIGNED(16) buf[4]; - _mm_store_si128((__m128i*)buf, val); - return buf[i]; - } int get0() const { return _mm_cvtsi128_si32(val); @@ -1026,12 +998,6 @@ struct v_float32x4 { val = _mm_setr_ps(v0, v1, v2, v3); } - float get(int i) const - { - float CV_DECL_ALIGNED(16) buf[4]; - _mm_store_ps(buf, val); - return buf[i]; - } float get0() const { return _mm_cvtss_f32(val); @@ -1046,12 +1012,6 @@ struct v_float64x2 { val = _mm_setr_pd(v0, v1); } - double get(int i) const - { - double CV_DECL_ALIGNED(16) buf[2]; - _mm_store_pd(buf, val); - return buf[i]; - } double get0() const { return _mm_cvtsd_f64(val); @@ -1376,10 +1336,10 @@ OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1)) OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1))) OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1))) -inline v_float32x4 v_sqrt(v_float32x4 x) +inline v_float32x4 v_sqrt(const v_float32x4& x) { return v_float32x4(_mm_sqrt_ps(x.val)); } -inline v_float32x4 v_invsqrt(v_float32x4 x) +inline v_float32x4 v_invsqrt(const v_float32x4& x) { static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f); __m128 t = x.val; @@ -1389,18 +1349,18 @@ inline v_float32x4 v_invsqrt(v_float32x4 x) return v_float32x4(t); } -inline v_float64x2 v_sqrt(v_float64x2 x) +inline v_float64x2 v_sqrt(const v_float64x2& x) { return v_float64x2(_mm_sqrt_pd(x.val)); } -inline v_float64x2 v_invsqrt(v_float64x2 x) +inline v_float64x2 v_invsqrt(const v_float64x2& x) { static const __m128d v_1 = _mm_set1_pd(1.); return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val))); } -inline v_float32x4 v_abs(v_float32x4 x) +inline v_float32x4 v_abs(const 
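/*
   Two themes in this hunk: the get(i) accessors are dropped because lane extraction
   with a runtime index cannot map onto _mm_extract_epi16 and friends, whose lane
   number must be a compile-time immediate (the removed fallbacks spilled to an
   aligned buffer instead); and v_sqrt/v_invsqrt/v_abs now take their argument by
   const reference, since 32-bit MSVC rejects passing 16-byte-aligned vector types by
   value (error C2719). The v_abs bodies are plain sign-bit masking; a scalar sketch
   of the same trick (illustrative):

       float fabs_bits(float v)
       {
           union { float f; unsigned u; } t;
           t.f = v;
           t.u &= 0x7fffffffu;   // clear the IEEE-754 sign bit
           return t.f;           // what _mm_and_ps does on each lane
       }
*/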
v_float32x4& x) { return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); } -inline v_float64x2 v_abs(v_float64x2 x) +inline v_float64x2 v_abs(const v_float64x2& x) { return v_float64x2(_mm_and_pd(x.val, _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1)))); From ce7590bd248a418eab6245ec6f9bf70b885904c6 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Fri, 17 Apr 2015 08:32:06 +0300 Subject: [PATCH 24/48] continuing fixing universal intrinsics --- modules/hal/include/opencv2/hal/intrin.hpp | 38 +++++++++++----------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp index 3c53d6079..e959a49cf 100644 --- a/modules/hal/include/opencv2/hal/intrin.hpp +++ b/modules/hal/include/opencv2/hal/intrin.hpp @@ -2218,7 +2218,7 @@ inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8) OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_u8) +OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8) OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16) OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16) OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16) @@ -2342,11 +2342,11 @@ inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); } OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8) -OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, OPENCV_HAL_NOP, s8, u8) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16) -OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, OPENCV_HAL_NOP, s16, u16) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32) -OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, OPENCV_HAL_NOP, s32, u32) +OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8) @@ -2381,18 +2381,18 @@ inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_ } // trade efficiency for convenience -#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, _Tp, suffix) \ +#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \ inline _Tpvec operator << (const _Tpvec& a, int n) \ -{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##suffix((_Tp)n))); } \ +{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \ inline _Tpvec operator >> (const _Tpvec& a, int n) \ -{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##suffix((_Tp)-n))); } +{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, short, s16) -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, int, s32) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16) 
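/*
   The reworked shift macro above exists because NEON's vshlq_* shifts each lane by a
   per-lane signed count, negative meaning shift right, and the count-vector argument
   is a signed type even for unsigned data (e.g. vshlq_u16 takes an int16x8_t count).
   Building the count with the matching signed vdupq_n_s8/s16/s32, via the new _Tps
   and ssuffix parameters, is what makes the expansion type-correct. A hedged sketch
   of what one instantiation expands to (illustrative):

       // v_uint16x8 operator >> (a, n) becomes roughly:
       //   vshlq_u16(a.val, vdupq_n_s16((short)-n))   // negative count = right shift
*/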
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32) +OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mullo, vmulq_u16) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mullo, vmulq_s16) @@ -2444,8 +2444,8 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, min, std::min) inline int v_signmask(const v_uint8x16& a) { - uint8x8_t m0 = vcreate_u8(CV_BIG_UINT(0x0706050403020100)); - uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_u8(m0, m0)); + int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100)); + uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0)); uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0))); return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8); } @@ -2454,8 +2454,8 @@ inline int v_signmask(const v_int8x16& a) inline int v_signmask(const v_uint16x8& a) { - uint16x4_t m0 = vcreate_u16(CV_BIG_UINT(0x0003000200010000)); - uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_u16(m0, m0)); + int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000)); + uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0)); uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0)); return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4); } @@ -2464,8 +2464,8 @@ inline int v_signmask(const v_int16x8& a) inline int v_signmask(const v_uint32x4& a) { - uint32x2_t m0 = vcreate_u32(CV_BIG_UINT(0x0000000100000000)); - uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_u32(m0, m0)); + int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000)); + uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0)); uint64x2_t v1 = vpaddlq_u32(v0); return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2); } From 17675b4cb83810dcd582cca0eb0d1cd7ddb545e8 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Fri, 17 Apr 2015 10:51:49 +0300 Subject: [PATCH 25/48] trying to fix test failures --- modules/hal/include/opencv2/hal/intrin.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp index e959a49cf..b067301d8 100644 --- a/modules/hal/include/opencv2/hal/intrin.hpp +++ b/modules/hal/include/opencv2/hal/intrin.hpp @@ -2287,16 +2287,16 @@ inline v_float32x4 operator ~ (const v_float32x4& a) inline v_float32x4 v_sqrt(const v_float32x4& x) { float32x4_t e = vrsqrteq_f32(x.val); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e); - return v_float32x4(vmulq_f32(e, x.val)); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); + return v_float32x4(vmulq_f32(x.val, e)); } inline v_float32x4 v_invsqrt(const v_float32x4& x) { float32x4_t e = vrsqrteq_f32(x.val); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), x.val), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); return v_float32x4(e); } From 92f2ad0a1128f5559708359efe01899586c89c33 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Fri, 17 Apr 2015 11:32:20 +0300 Subject: [PATCH 26/48] trying to fix test failures (take 2) --- modules/hal/include/opencv2/hal/intrin.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/modules/hal/include/opencv2/hal/intrin.hpp b/modules/hal/include/opencv2/hal/intrin.hpp index b067301d8..fc40cfda8 
100644 --- a/modules/hal/include/opencv2/hal/intrin.hpp +++ b/modules/hal/include/opencv2/hal/intrin.hpp @@ -47,6 +47,7 @@ #include #include +#include #include #define OPENCV_HAL_ADD(a, b) ((a) + (b)) @@ -2286,9 +2287,10 @@ inline v_float32x4 operator ~ (const v_float32x4& a) inline v_float32x4 v_sqrt(const v_float32x4& x) { - float32x4_t e = vrsqrteq_f32(x.val); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); - e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e); + float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN)); + float32x4_t e = vrsqrteq_f32(x1); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e); return v_float32x4(vmulq_f32(x.val, e)); } From 316d76bdb6347fb530578cf0ea9208446bc5a5b8 Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Fri, 17 Apr 2015 16:00:33 +0300 Subject: [PATCH 27/48] Rename OpenCV Engine to org.opencv.engine3 for release candidate --- modules/java/CMakeLists.txt | 4 ++-- .../src/java/android+AsyncServiceHelper.java | 8 +++---- ...idl => engine3+OpenCVEngineInterface.aidl} | 2 +- .../service/engine/AndroidManifest.xml | 6 ++--- .../jni/JNIWrapper/HardwareDetector_jni.cpp | 8 +++---- .../jni/JNIWrapper/HardwareDetector_jni.h | 8 +++---- .../jni/JNIWrapper/OpenCVEngine_jni.cpp | 6 ++--- .../engine/jni/JNIWrapper/OpenCVEngine_jni.h | 6 ++--- .../jni/JNIWrapper/OpenCVLibraryInfo.cpp | 10 ++++----- .../engine/jni/JNIWrapper/OpenCVLibraryInfo.h | 10 ++++----- .../engine/jni/NativeService/PackageInfo.cpp | 2 +- .../service/engine/jni/include/EngineCommon.h | 4 ++-- .../{engine => engine3}/BinderConnector.java | 2 +- .../{engine => engine3}/HardwareDetector.java | 2 +- .../{engine => engine3}/MarketConnector.java | 4 ++-- .../OpenCVEngineInterface.aidl | 2 +- .../OpenCVEngineService.java | 2 +- .../OpenCVLibraryInfo.java | 2 +- .../manager/ManagerActivity.java | 22 +++++++++---------- .../manager/PackageListAdapter.java | 2 +- .../service/engine_test/AndroidManifest.xml | 6 ++--- .../test/EngineInterfaceTest.java | 16 +++++++------- 22 files changed, 67 insertions(+), 67 deletions(-) rename modules/java/generator/src/java/{engine+OpenCVEngineInterface.aidl => engine3+OpenCVEngineInterface.aidl} (97%) rename platforms/android/service/engine/src/org/opencv/{engine => engine3}/BinderConnector.java (97%) rename platforms/android/service/engine/src/org/opencv/{engine => engine3}/HardwareDetector.java (98%) rename platforms/android/service/engine/src/org/opencv/{engine => engine3}/MarketConnector.java (97%) rename platforms/android/service/engine/src/org/opencv/{engine => engine3}/OpenCVEngineInterface.aidl (97%) rename platforms/android/service/engine/src/org/opencv/{engine => engine3}/OpenCVEngineService.java (98%) rename platforms/android/service/engine/src/org/opencv/{engine => engine3}/OpenCVLibraryInfo.java (97%) rename platforms/android/service/engine/src/org/opencv/{engine => engine3}/manager/ManagerActivity.java (97%) rename platforms/android/service/engine/src/org/opencv/{engine => engine3}/manager/PackageListAdapter.java (96%) rename platforms/android/service/engine_test/src/org/opencv/{engine => engine3}/test/EngineInterfaceTest.java (91%) diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt index 3ba9c54dc..ce24daf79 100644 --- a/modules/java/CMakeLists.txt +++ b/modules/java/CMakeLists.txt @@ -174,8 +174,8 @@ endforeach() file(REMOVE_RECURSE "${probe_dir}") if(NOT ANDROID) - ocv_list_filterout(handwritten_java_sources "/(engine|android)\\\\+") - 
ocv_list_filterout(handwritten_aidl_sources "/(engine|android)\\\\+") + ocv_list_filterout(handwritten_java_sources "/(engine3|android)\\\\+") + ocv_list_filterout(handwritten_aidl_sources "/(engine3|android)\\\\+") else() file(GLOB_RECURSE handwrittren_lib_project_files_rel RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/android_lib/" "${CMAKE_CURRENT_SOURCE_DIR}/android_lib/*") list(REMOVE_ITEM handwrittren_lib_project_files_rel "${ANDROID_MANIFEST_FILE}") diff --git a/modules/java/generator/src/java/android+AsyncServiceHelper.java b/modules/java/generator/src/java/android+AsyncServiceHelper.java index e18d5a500..9fdf5ac1f 100644 --- a/modules/java/generator/src/java/android+AsyncServiceHelper.java +++ b/modules/java/generator/src/java/android+AsyncServiceHelper.java @@ -4,7 +4,7 @@ import java.io.File; import java.util.StringTokenizer; import org.opencv.core.Core; -import org.opencv.engine.OpenCVEngineInterface; +import org.opencv.engine3.OpenCVEngineInterface; import android.content.ComponentName; import android.content.Context; @@ -21,8 +21,8 @@ class AsyncServiceHelper final LoaderCallbackInterface Callback) { AsyncServiceHelper helper = new AsyncServiceHelper(Version, AppContext, Callback); - Intent intent = new Intent("org.opencv.engine.BIND"); - intent.setPackage("org.opencv.engine"); + Intent intent = new Intent("org.opencv.engine3.BIND"); + intent.setPackage("org.opencv.engine3"); if (AppContext.bindService(intent, helper.mServiceConnection, Context.BIND_AUTO_CREATE)) { return true; @@ -151,7 +151,7 @@ class AsyncServiceHelper /** * URL of OpenCV Manager page on Google Play Market. */ - protected static final String OPEN_CV_SERVICE_URL = "market://details?id=org.opencv.engine"; + protected static final String OPEN_CV_SERVICE_URL = "market://details?id=org.opencv.engine3"; protected ServiceConnection mServiceConnection = new ServiceConnection() { diff --git a/modules/java/generator/src/java/engine+OpenCVEngineInterface.aidl b/modules/java/generator/src/java/engine3+OpenCVEngineInterface.aidl similarity index 97% rename from modules/java/generator/src/java/engine+OpenCVEngineInterface.aidl rename to modules/java/generator/src/java/engine3+OpenCVEngineInterface.aidl index 21fe5f716..b84eaaafb 100644 --- a/modules/java/generator/src/java/engine+OpenCVEngineInterface.aidl +++ b/modules/java/generator/src/java/engine3+OpenCVEngineInterface.aidl @@ -1,4 +1,4 @@ -package org.opencv.engine; +package org.opencv.engine3; /** * Class provides a Java interface for OpenCV Engine Service. It's synchronous with native OpenCVEngine class. 
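A note on why this rename fans out into the C++ sources as well: JNI resolves a native
method through a mangled symbol of the form Java_<package with '.' as '_'>_<Class>_<method>,
derived from the Java-side package. Once the Java classes move to org.opencv.engine3, a
wrapper still exported as Java_org_opencv_engine_... is never found and the call fails at
runtime with an UnsatisfiedLinkError, which is why every JNI function in the diffs below is
renamed in lockstep. As a minimal illustration (hypothetical snippet mirroring the real
HardwareDetector):

    package org.opencv.engine3;
    public class HardwareDetector {
        // binds to the C symbol Java_org_opencv_engine3_HardwareDetector_GetCpuID
        public static native int GetCpuID();
    }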
diff --git a/platforms/android/service/engine/AndroidManifest.xml b/platforms/android/service/engine/AndroidManifest.xml index 4f78c314a..40adb98d7 100644 --- a/platforms/android/service/engine/AndroidManifest.xml +++ b/platforms/android/service/engine/AndroidManifest.xml @@ -1,6 +1,6 @@ @@ -13,12 +13,12 @@ - + diff --git a/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.cpp b/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.cpp index 3e490a316..0145abb8c 100644 --- a/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.cpp +++ b/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.cpp @@ -3,23 +3,23 @@ #include #include -JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_GetCpuID(JNIEnv* , jclass) +JNIEXPORT jint JNICALL Java_org_opencv_engine3_HardwareDetector_GetCpuID(JNIEnv* , jclass) { return GetCpuID(); } -JNIEXPORT jstring JNICALL Java_org_opencv_engine_HardwareDetector_GetPlatformName(JNIEnv* env, jclass) +JNIEXPORT jstring JNICALL Java_org_opencv_engine3_HardwareDetector_GetPlatformName(JNIEnv* env, jclass) { std::string hardware_name = GetPlatformName(); return env->NewStringUTF(hardware_name.c_str()); } -JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_GetProcessorCount(JNIEnv* , jclass) +JNIEXPORT jint JNICALL Java_org_opencv_engine3_HardwareDetector_GetProcessorCount(JNIEnv* , jclass) { return GetProcessorCount(); } -JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_DetectKnownPlatforms(JNIEnv* , jclass) +JNIEXPORT jint JNICALL Java_org_opencv_engine3_HardwareDetector_DetectKnownPlatforms(JNIEnv* , jclass) { return DetectKnownPlatforms(); } diff --git a/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.h b/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.h index 43fad33c8..61f294e99 100644 --- a/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.h +++ b/platforms/android/service/engine/jni/JNIWrapper/HardwareDetector_jni.h @@ -14,7 +14,7 @@ extern "C" { * Method: GetCpuID * Signature: ()I */ -JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_GetCpuID +JNIEXPORT jint JNICALL Java_org_opencv_engine3_HardwareDetector_GetCpuID (JNIEnv *, jclass); /* @@ -22,7 +22,7 @@ JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_GetCpuID * Method: GetPlatformName * Signature: ()Ljava/lang/String; */ -JNIEXPORT jstring JNICALL Java_org_opencv_engine_HardwareDetector_GetPlatformName +JNIEXPORT jstring JNICALL Java_org_opencv_engine3_HardwareDetector_GetPlatformName (JNIEnv *, jclass); /* @@ -30,7 +30,7 @@ JNIEXPORT jstring JNICALL Java_org_opencv_engine_HardwareDetector_GetPlatformNam * Method: GetProcessorCount * Signature: ()I */ -JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_GetProcessorCount +JNIEXPORT jint JNICALL Java_org_opencv_engine3_HardwareDetector_GetProcessorCount (JNIEnv *, jclass); /* @@ -38,7 +38,7 @@ JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_GetProcessorCount * Method: DetectKnownPlatforms * Signature: ()I */ -JNIEXPORT jint JNICALL Java_org_opencv_engine_HardwareDetector_DetectKnownPlatforms +JNIEXPORT jint JNICALL Java_org_opencv_engine3_HardwareDetector_DetectKnownPlatforms (JNIEnv *, jclass); #ifdef __cplusplus diff --git a/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.cpp b/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.cpp index dac491656..1dd038125 100644 --- 
a/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.cpp +++ b/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.cpp @@ -15,7 +15,7 @@ using namespace android; sp OpenCVEngineBinder = NULL; IPackageManager* PackageManager = NULL; -JNIEXPORT jobject JNICALL Java_org_opencv_engine_BinderConnector_Connect(JNIEnv* env, jobject) +JNIEXPORT jobject JNICALL Java_org_opencv_engine3_BinderConnector_Connect(JNIEnv* env, jobject) { LOGI("Creating new component"); if (NULL != OpenCVEngineBinder.get()) @@ -30,7 +30,7 @@ JNIEXPORT jobject JNICALL Java_org_opencv_engine_BinderConnector_Connect(JNIEnv* return javaObjectForIBinder(env, OpenCVEngineBinder); } -JNIEXPORT jboolean JNICALL Java_org_opencv_engine_BinderConnector_Init(JNIEnv* env, jobject , jobject market) +JNIEXPORT jboolean JNICALL Java_org_opencv_engine3_BinderConnector_Init(JNIEnv* env, jobject , jobject market) { LOGD("Java_org_opencv_engine_BinderConnector_Init"); @@ -58,7 +58,7 @@ JNIEXPORT jboolean JNICALL Java_org_opencv_engine_BinderConnector_Init(JNIEnv* e } } -JNIEXPORT void JNICALL Java_org_opencv_engine_BinderConnector_Final(JNIEnv *, jobject) +JNIEXPORT void JNICALL Java_org_opencv_engine3_BinderConnector_Final(JNIEnv *, jobject) { LOGD("Java_org_opencv_engine_BinderConnector_Final"); diff --git a/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.h b/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.h index cd0734eb0..bfeafb049 100644 --- a/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.h +++ b/platforms/android/service/engine/jni/JNIWrapper/OpenCVEngine_jni.h @@ -12,7 +12,7 @@ extern "C" { * Method: Connect * Signature: ()Landroid/os/IBinder; */ -JNIEXPORT jobject JNICALL Java_org_opencv_engine_BinderConnector_Connect +JNIEXPORT jobject JNICALL Java_org_opencv_engine3_BinderConnector_Connect (JNIEnv *, jobject); /* @@ -20,7 +20,7 @@ JNIEXPORT jobject JNICALL Java_org_opencv_engine_BinderConnector_Connect * Method: Init * Signature: (Lorg/opencv/engine/MarketConnector;)Z */ -JNIEXPORT jboolean JNICALL Java_org_opencv_engine_BinderConnector_Init +JNIEXPORT jboolean JNICALL Java_org_opencv_engine3_BinderConnector_Init (JNIEnv *, jobject, jobject); /* @@ -28,7 +28,7 @@ JNIEXPORT jboolean JNICALL Java_org_opencv_engine_BinderConnector_Init * Method: Final * Signature: ()V */ -JNIEXPORT void JNICALL Java_org_opencv_engine_BinderConnector_Final +JNIEXPORT void JNICALL Java_org_opencv_engine3_BinderConnector_Final (JNIEnv *, jobject); #ifdef __cplusplus diff --git a/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.cpp b/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.cpp index e7dc6d2f1..f1c5ec19a 100644 --- a/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.cpp +++ b/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.cpp @@ -3,7 +3,7 @@ #include #include -JNIEXPORT jlong JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_open +JNIEXPORT jlong JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_open (JNIEnv * env, jobject, jstring str) { const char* infoLibPath = env->GetStringUTFChars(str, NULL); @@ -21,7 +21,7 @@ JNIEXPORT jlong JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_open return (jlong)handle; } -JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getPackageName +JNIEXPORT jstring JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_getPackageName (JNIEnv* env, jobject, jlong handle) { InfoFunctionType info_func; @@ -41,7 +41,7 @@ JNIEXPORT jstring JNICALL 
Java_org_opencv_engine_OpenCVLibraryInfo_getPackageNam return env->NewStringUTF(result); } -JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getLibraryList +JNIEXPORT jstring JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_getLibraryList (JNIEnv* env, jobject, jlong handle) { InfoFunctionType info_func; @@ -61,7 +61,7 @@ JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getLibraryLis return env->NewStringUTF(result); } -JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getVersionName +JNIEXPORT jstring JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_getVersionName (JNIEnv* env, jobject, jlong handle) { InfoFunctionType info_func; @@ -81,7 +81,7 @@ JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getVersionNam return env->NewStringUTF(result); } -JNIEXPORT void JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_close +JNIEXPORT void JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_close (JNIEnv*, jobject, jlong handle) { dlclose((void*)handle); diff --git a/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.h b/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.h index b02050ffd..574f0b4e0 100644 --- a/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.h +++ b/platforms/android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.h @@ -6,19 +6,19 @@ extern "C" { #endif -JNIEXPORT jlong JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_open +JNIEXPORT jlong JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_open (JNIEnv *, jobject, jstring); -JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getPackageName +JNIEXPORT jstring JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_getPackageName (JNIEnv *, jobject, jlong); -JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getLibraryList +JNIEXPORT jstring JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_getLibraryList (JNIEnv *, jobject, jlong); -JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getVersionName +JNIEXPORT jstring JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_getVersionName (JNIEnv *, jobject, jlong); -JNIEXPORT void JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_close +JNIEXPORT void JNICALL Java_org_opencv_engine3_OpenCVLibraryInfo_close (JNIEnv *, jobject, jlong); #ifdef __cplusplus diff --git a/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp b/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp index d831bf7a5..d95f9f943 100644 --- a/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp +++ b/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp @@ -27,7 +27,7 @@ map PackageInfo::InitPlatformNameMap() const map PackageInfo::PlatformNameMap = InitPlatformNameMap(); const string PackageInfo::BasePackageName = "org.opencv.lib"; -const string DEFAULT_ENGINE_INSTALL_PATH = "/data/data/org.opencv.engine"; +const string DEFAULT_ENGINE_INSTALL_PATH = "/data/data/org.opencv.engine3"; inline string JoinARMFeatures(int cpu_id) { diff --git a/platforms/android/service/engine/jni/include/EngineCommon.h b/platforms/android/service/engine/jni/include/EngineCommon.h index a03f02c68..2948db8c1 100644 --- a/platforms/android/service/engine/jni/include/EngineCommon.h +++ b/platforms/android/service/engine/jni/include/EngineCommon.h @@ -13,9 +13,9 @@ #define LIB_OPENCV_INFO_NAME "libopencv_info.so" // OpenCV Manager package name -#define OPENCV_ENGINE_PACKAGE "org.opencv.engine" +#define OPENCV_ENGINE_PACKAGE 
"org.opencv.engine3" // Class name of OpenCV engine binder object. Is needned for connection to service -#define OPECV_ENGINE_CLASSNAME "org.opencv.engine.OpenCVEngineInterface" +#define OPECV_ENGINE_CLASSNAME "org.opencv.engine3.OpenCVEngineInterface" typedef const char* (*InfoFunctionType)(); diff --git a/platforms/android/service/engine/src/org/opencv/engine/BinderConnector.java b/platforms/android/service/engine/src/org/opencv/engine3/BinderConnector.java similarity index 97% rename from platforms/android/service/engine/src/org/opencv/engine/BinderConnector.java rename to platforms/android/service/engine/src/org/opencv/engine3/BinderConnector.java index bde54d5b9..a54843354 100644 --- a/platforms/android/service/engine/src/org/opencv/engine/BinderConnector.java +++ b/platforms/android/service/engine/src/org/opencv/engine3/BinderConnector.java @@ -1,4 +1,4 @@ -package org.opencv.engine; +package org.opencv.engine3; import android.os.IBinder; diff --git a/platforms/android/service/engine/src/org/opencv/engine/HardwareDetector.java b/platforms/android/service/engine/src/org/opencv/engine3/HardwareDetector.java similarity index 98% rename from platforms/android/service/engine/src/org/opencv/engine/HardwareDetector.java rename to platforms/android/service/engine/src/org/opencv/engine3/HardwareDetector.java index f115070aa..65ee243d3 100644 --- a/platforms/android/service/engine/src/org/opencv/engine/HardwareDetector.java +++ b/platforms/android/service/engine/src/org/opencv/engine3/HardwareDetector.java @@ -1,4 +1,4 @@ -package org.opencv.engine; +package org.opencv.engine3; public class HardwareDetector { diff --git a/platforms/android/service/engine/src/org/opencv/engine/MarketConnector.java b/platforms/android/service/engine/src/org/opencv/engine3/MarketConnector.java similarity index 97% rename from platforms/android/service/engine/src/org/opencv/engine/MarketConnector.java rename to platforms/android/service/engine/src/org/opencv/engine3/MarketConnector.java index da595915f..4e5f51acc 100644 --- a/platforms/android/service/engine/src/org/opencv/engine/MarketConnector.java +++ b/platforms/android/service/engine/src/org/opencv/engine3/MarketConnector.java @@ -1,4 +1,4 @@ -package org.opencv.engine; +package org.opencv.engine3; import java.util.ArrayList; import java.util.Iterator; @@ -99,7 +99,7 @@ public class MarketConnector List AllPackages = mContext.getPackageManager().getInstalledPackages(PackageManager.GET_CONFIGURATIONS); List OpenCVPackages = new ArrayList(); try { - OpenCVPackages.add(mContext.getPackageManager().getPackageInfo("org.opencv.engine", PackageManager.GET_CONFIGURATIONS)); + OpenCVPackages.add(mContext.getPackageManager().getPackageInfo("org.opencv.engine3", PackageManager.GET_CONFIGURATIONS)); } catch (NameNotFoundException e) { Log.e(TAG, "OpenCV Manager package info was not found!"); e.printStackTrace(); diff --git a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl b/platforms/android/service/engine/src/org/opencv/engine3/OpenCVEngineInterface.aidl similarity index 97% rename from platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl rename to platforms/android/service/engine/src/org/opencv/engine3/OpenCVEngineInterface.aidl index 13e0f7f84..2b957d4b0 100644 --- a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl +++ b/platforms/android/service/engine/src/org/opencv/engine3/OpenCVEngineInterface.aidl @@ -1,4 +1,4 @@ -package org.opencv.engine; +package org.opencv.engine3; 
/** * Class provides Java interface to OpenCV Engine Service. Is synchronious with native OpenCVEngine class. diff --git a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java b/platforms/android/service/engine/src/org/opencv/engine3/OpenCVEngineService.java similarity index 98% rename from platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java rename to platforms/android/service/engine/src/org/opencv/engine3/OpenCVEngineService.java index b3c4ea057..c7df4a811 100644 --- a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineService.java +++ b/platforms/android/service/engine/src/org/opencv/engine3/OpenCVEngineService.java @@ -1,4 +1,4 @@ -package org.opencv.engine; +package org.opencv.engine3; import android.app.Service; import android.content.Intent; diff --git a/platforms/android/service/engine/src/org/opencv/engine/OpenCVLibraryInfo.java b/platforms/android/service/engine/src/org/opencv/engine3/OpenCVLibraryInfo.java similarity index 97% rename from platforms/android/service/engine/src/org/opencv/engine/OpenCVLibraryInfo.java rename to platforms/android/service/engine/src/org/opencv/engine3/OpenCVLibraryInfo.java index d0f67bfde..cc36b152a 100644 --- a/platforms/android/service/engine/src/org/opencv/engine/OpenCVLibraryInfo.java +++ b/platforms/android/service/engine/src/org/opencv/engine3/OpenCVLibraryInfo.java @@ -1,4 +1,4 @@ -package org.opencv.engine; +package org.opencv.engine3; public class OpenCVLibraryInfo { public OpenCVLibraryInfo(String packagePath) { diff --git a/platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java b/platforms/android/service/engine/src/org/opencv/engine3/manager/ManagerActivity.java similarity index 97% rename from platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java rename to platforms/android/service/engine/src/org/opencv/engine3/manager/ManagerActivity.java index b4e0be5a9..7308e848e 100644 --- a/platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java +++ b/platforms/android/service/engine/src/org/opencv/engine3/manager/ManagerActivity.java @@ -1,15 +1,15 @@ -package org.opencv.engine.manager; +package org.opencv.engine3.manager; import java.util.ArrayList; import java.util.HashMap; import java.util.StringTokenizer; -import org.opencv.engine.HardwareDetector; -import org.opencv.engine.MarketConnector; -import org.opencv.engine.OpenCVEngineInterface; -import org.opencv.engine.OpenCVEngineService; -import org.opencv.engine.OpenCVLibraryInfo; -import org.opencv.engine.R; +import org.opencv.engine3.HardwareDetector; +import org.opencv.engine3.MarketConnector; +import org.opencv.engine3.OpenCVEngineInterface; +import org.opencv.engine3.OpenCVEngineService; +import org.opencv.engine3.OpenCVLibraryInfo; +import org.opencv.engine3.R; import android.annotation.TargetApi; import android.app.Activity; import android.app.AlertDialog; @@ -161,7 +161,7 @@ public class ManagerActivity extends Activity mUpdateEngineButton.setOnClickListener(new OnClickListener() { public void onClick(View v) { - if (!mMarket.InstallAppFromMarket("org.opencv.engine")) + if (!mMarket.InstallAppFromMarket("org.opencv.engine3")) { Toast toast = Toast.makeText(getApplicationContext(), "Google Play is not avaliable", Toast.LENGTH_SHORT); toast.show(); @@ -207,7 +207,7 @@ public class ManagerActivity extends Activity public void onItemClick(AdapterView adapter, View view, int position, long id) { //if (!mListViewItems.get((int) 
id).get("Name").equals("Built-in OpenCV library")); - if (!mInstalledPackageInfo[(int) id].packageName.equals("org.opencv.engine")) + if (!mInstalledPackageInfo[(int) id].packageName.equals("org.opencv.engine3")) { mInstalledPackageView.setTag(Integer.valueOf((int)id)); mActionDialog.show(); @@ -221,7 +221,7 @@ public class ManagerActivity extends Activity public void onReceive(Context context, Intent intent) { Log.d("OpenCVManager/Receiver", "Broadcast message " + intent.getAction() + " receiver"); Log.d("OpenCVManager/Receiver", "Filling package list on broadcast message"); - if (!bindService(new Intent("org.opencv.engine.BIND"), + if (!bindService(new Intent("org.opencv.engine3.BIND"), new OpenCVEngineServiceConnection(), Context.BIND_AUTO_CREATE)) { TextView EngineVersionView = (TextView)findViewById(R.id.EngineVersionValue); @@ -350,7 +350,7 @@ public class ManagerActivity extends Activity else NativeLibDir = "/data/data/" + mInstalledPackageInfo[i].packageName + "/lib"; - if (PackageName.equals("org.opencv.engine")) + if (PackageName.equals("org.opencv.engine3")) { OpenCVLibraryInfo NativeInfo = new OpenCVLibraryInfo(NativeLibDir); if (NativeInfo.status()) diff --git a/platforms/android/service/engine/src/org/opencv/engine/manager/PackageListAdapter.java b/platforms/android/service/engine/src/org/opencv/engine3/manager/PackageListAdapter.java similarity index 96% rename from platforms/android/service/engine/src/org/opencv/engine/manager/PackageListAdapter.java rename to platforms/android/service/engine/src/org/opencv/engine3/manager/PackageListAdapter.java index 17707efd3..4a929ae95 100644 --- a/platforms/android/service/engine/src/org/opencv/engine/manager/PackageListAdapter.java +++ b/platforms/android/service/engine/src/org/opencv/engine3/manager/PackageListAdapter.java @@ -1,4 +1,4 @@ -package org.opencv.engine.manager; +package org.opencv.engine3.manager; import java.util.List; import java.util.Map; diff --git a/platforms/android/service/engine_test/AndroidManifest.xml b/platforms/android/service/engine_test/AndroidManifest.xml index 5779d90a6..5ee354225 100644 --- a/platforms/android/service/engine_test/AndroidManifest.xml +++ b/platforms/android/service/engine_test/AndroidManifest.xml @@ -1,6 +1,6 @@ @@ -8,7 +8,7 @@ + android:targetPackage="org.opencv.engine3" /> - \ No newline at end of file + diff --git a/platforms/android/service/engine_test/src/org/opencv/engine/test/EngineInterfaceTest.java b/platforms/android/service/engine_test/src/org/opencv/engine3/test/EngineInterfaceTest.java similarity index 91% rename from platforms/android/service/engine_test/src/org/opencv/engine/test/EngineInterfaceTest.java rename to platforms/android/service/engine_test/src/org/opencv/engine3/test/EngineInterfaceTest.java index d67f22faf..b45af98a3 100644 --- a/platforms/android/service/engine_test/src/org/opencv/engine/test/EngineInterfaceTest.java +++ b/platforms/android/service/engine_test/src/org/opencv/engine3/test/EngineInterfaceTest.java @@ -1,7 +1,7 @@ -package org.opencv.engine.test; +package org.opencv.engine3.test; -import org.opencv.engine.OpenCVEngineInterface; -import org.opencv.engine.OpenCVEngineService; +import org.opencv.engine3.OpenCVEngineInterface; +import org.opencv.engine3.OpenCVEngineService; import android.content.Intent; import android.os.IBinder; @@ -18,7 +18,7 @@ public class EngineInterfaceTest extends ServiceTestCase public void testVersion() throws RemoteException { - IBinder ServiceBinder = bindService(new Intent("org.opencv.engine.BIND")); + IBinder 
ServiceBinder = bindService(new Intent("org.opencv.engine3.BIND")); assertNotNull(ServiceBinder); OpenCVEngineInterface ServiceObj = OpenCVEngineInterface.Stub.asInterface(ServiceBinder); assertNotNull(ServiceObj); @@ -28,7 +28,7 @@ public class EngineInterfaceTest extends ServiceTestCase public void testInstallVersion() throws RemoteException { - IBinder ServiceBinder = bindService(new Intent("org.opencv.engine")); + IBinder ServiceBinder = bindService(new Intent("org.opencv.engine3")); assertNotNull(ServiceBinder); OpenCVEngineInterface ServiceObj = OpenCVEngineInterface.Stub.asInterface(ServiceBinder); assertNotNull(ServiceObj); @@ -37,7 +37,7 @@ public class EngineInterfaceTest extends ServiceTestCase public void testGetPathForExistVersion() throws RemoteException { - IBinder ServiceBinder = bindService(new Intent("org.opencv.engine")); + IBinder ServiceBinder = bindService(new Intent("org.opencv.engine3")); assertNotNull(ServiceBinder); OpenCVEngineInterface ServiceObj = OpenCVEngineInterface.Stub.asInterface(ServiceBinder); assertNotNull(ServiceObj); @@ -46,7 +46,7 @@ public class EngineInterfaceTest extends ServiceTestCase public void testGetPathForUnExistVersion() throws RemoteException { - IBinder ServiceBinder = bindService(new Intent("org.opencv.engine")); + IBinder ServiceBinder = bindService(new Intent("org.opencv.engine3")); assertNotNull(ServiceBinder); OpenCVEngineInterface ServiceObj = OpenCVEngineInterface.Stub.asInterface(ServiceBinder); assertNotNull(ServiceObj); @@ -55,7 +55,7 @@ public class EngineInterfaceTest extends ServiceTestCase public void testInstallAndGetVersion() throws RemoteException { - IBinder ServiceBinder = bindService(new Intent("org.opencv.engine")); + IBinder ServiceBinder = bindService(new Intent("org.opencv.engine3")); assertNotNull(ServiceBinder); OpenCVEngineInterface ServiceObj = OpenCVEngineInterface.Stub.asInterface(ServiceBinder); assertNotNull(ServiceObj); From a362aca7839aa2b9656897b4423e96f5dd255fe1 Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Fri, 17 Apr 2015 16:59:52 +0300 Subject: [PATCH 28/48] Rename manager package to OpenCV3 Manager --- .../java/generator/src/java/android+AsyncServiceHelper.java | 4 ++-- platforms/android/service/engine/build.xml | 2 +- platforms/android/service/engine/res/values/strings.xml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/java/generator/src/java/android+AsyncServiceHelper.java b/modules/java/generator/src/java/android+AsyncServiceHelper.java index 9fdf5ac1f..9882d60b5 100644 --- a/modules/java/generator/src/java/android+AsyncServiceHelper.java +++ b/modules/java/generator/src/java/android+AsyncServiceHelper.java @@ -77,7 +77,7 @@ class AsyncServiceHelper private LoaderCallbackInterface mUserAppCallback = Callback; public String getPackageName() { - return "OpenCV Manager"; + return "OpenCV3 Manager"; } public void install() { Log.d(TAG, "Trying to install OpenCV Manager via Google Play"); @@ -123,7 +123,7 @@ class AsyncServiceHelper private LoaderCallbackInterface mUserAppCallback = Callback; public String getPackageName() { - return "OpenCV Manager"; + return "OpenCV3 Manager"; } public void install() { diff --git a/platforms/android/service/engine/build.xml b/platforms/android/service/engine/build.xml index 98ddc3eac..47a283d8f 100644 --- a/platforms/android/service/engine/build.xml +++ b/platforms/android/service/engine/build.xml @@ -1,5 +1,5 @@ - +
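Taken together, patches 27/48 and 28/48 turn the release candidate into a separate
"OpenCV3 Manager" application (package org.opencv.engine3) that can be installed next to
the released 2.4-era org.opencv.engine manager. A client built against this branch reaches
the right service purely through the intent it binds, as in the AsyncServiceHelper changes
above; a minimal client-side sketch (illustrative; assumes a ServiceConnection that unwraps
the OpenCVEngineInterface binder):

    Intent intent = new Intent("org.opencv.engine3.BIND");
    intent.setPackage("org.opencv.engine3");
    boolean bound = appContext.bindService(intent, serviceConnection, Context.BIND_AUTO_CREATE);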