Merge branch 'brisk' of https://github.com/cbalint13/opencv into brisk

cbalint13 2015-05-06 11:23:50 +03:00
commit 6b1d5e48b6
183 changed files with 9868 additions and 3748 deletions

.gitignore (7 lines changed)

@ -9,7 +9,6 @@ Thumbs.db
tags
tegra/
bin/
CMakeFiles/
*.sdf
*.opensdf
*.obj
@ -17,3 +16,9 @@ CMakeFiles/
*.depend
*.rule
*.tmp
*/debug
*/CMakeFiles
CMakeCache.txt
*.suo
*.log
*.tlog

3rdparty/ffmpeg/build_win32.txt (new file, 42 lines)

@ -0,0 +1,42 @@
The build script still needs to be fixed.
Right now it assumes that 32-bit MinGW is in the system path and
64-bit MinGW is installed to c:\Apps\MinGW64.
It is important that gcc is used, not g++!
Otherwise the produced DLL will likely depend on libgcc_s_dw2-1.dll or a similar DLL,
whereas we want to build the DLLs with minimal dependencies: Win32 libraries + msvcrt.dll.
ffopencv.c is really a C++ source, hence -x c++ is used.
How to update opencv_ffmpeg.dll and opencv_ffmpeg_64.dll when a new version of FFMPEG is released:
1. Install 32-bit MinGW + MSYS from
http://sourceforge.net/projects/mingw/files/Automated%20MinGW%20Installer/mingw-get-inst/
Let's assume it's installed in C:\MSYS32.
2. Install 64-bit MinGW. http://mingw-w64.sourceforge.net/
Let's assume it's installed in C:\MSYS64.
3. Copy C:\MSYS32\msys to C:\MSYS64\msys. Edit C:\MSYS64\msys\etc\fstab, change C:\MSYS32 to C:\MSYS64.
4. Now you have working MSYS32 and MSYS64 environments.
Launch, one by one, C:\MSYS32\msys\msys.bat and C:\MSYS64\msys\msys.bat to create your home directories.
5. Download ffmpeg-x.y.z.tar.gz (where x.y.z denotes the actual ffmpeg version).
Copy it to the C:\MSYS{32|64}\msys\home\<loginname> directory.
6. To build 32-bit ffmpeg libraries, run C:\MSYS32\msys\msys.bat and type the following commands:
6.1. tar -xzf ffmpeg-x.y.z.tar.gz
6.2. mkdir build
6.3. cd build
6.4. ../ffmpeg-x.y.z/configure --enable-w32threads
6.5. make
6.6. make install
6.7. cd /local/lib
6.8. strip -g *.a
7. Then repeat the same for the 64-bit case. The output libs (libavcodec.a etc.) need to be renamed to libavcodec64.a etc.
8. Then copy all those libs to <opencv>\3rdparty\lib\ and copy the headers to <opencv>\3rdparty\include\ffmpeg_.
9. Then go to <opencv>\3rdparty\ffmpeg, edit make.bat
(change paths to the actual paths to your msys32 and msys64 distributions) and then run make.bat

View File

@ -3,9 +3,11 @@ set(HAVE_FFMPEG_CODEC 1)
set(HAVE_FFMPEG_FORMAT 1)
set(HAVE_FFMPEG_UTIL 1)
set(HAVE_FFMPEG_SWSCALE 1)
set(HAVE_FFMPEG_RESAMPLE 0)
set(HAVE_GENTOO_FFMPEG 1)
set(ALIASOF_libavcodec_VERSION 55.18.102)
set(ALIASOF_libavformat_VERSION 55.12.100)
set(ALIASOF_libavutil_VERSION 52.38.100)
set(ALIASOF_libswscale_VERSION 2.3.100)
set(ALIASOF_libavresample_VERSION 1.0.1)

3rdparty/ffmpeg/license.txt (new file, 520 lines)

@ -0,0 +1,520 @@
Copyright (C) 2001 Fabrice Bellard
FFmpeg is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
FFmpeg is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with FFmpeg; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
==================================================================================
GNU LESSER GENERAL PUBLIC LICENSE
Version 2.1, February 1999
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
[This is the first released version of the Lesser GPL. It also counts
as the successor of the GNU Library Public License, version 2, hence
the version number 2.1.]
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
Licenses are intended to guarantee your freedom to share and change
free software--to make sure the software is free for all its users.
This license, the Lesser General Public License, applies to some
specially designated software packages--typically libraries--of the
Free Software Foundation and other authors who decide to use it. You
can use it too, but we suggest you first think carefully about whether
this license or the ordinary General Public License is the better
strategy to use in any particular case, based on the explanations below.
When we speak of free software, we are referring to freedom of use,
not price. Our General Public Licenses are designed to make sure that
you have the freedom to distribute copies of free software (and charge
for this service if you wish); that you receive source code or can get
it if you want it; that you can change the software and use pieces of
it in new free programs; and that you are informed that you can do
these things.
To protect your rights, we need to make restrictions that forbid
distributors to deny you these rights or to ask you to surrender these
rights. These restrictions translate to certain responsibilities for
you if you distribute copies of the library or if you modify it.
For example, if you distribute copies of the library, whether gratis
or for a fee, you must give the recipients all the rights that we gave
you. You must make sure that they, too, receive or can get the source
code. If you link other code with the library, you must provide
complete object files to the recipients, so that they can relink them
with the library after making changes to the library and recompiling
it. And you must show them these terms so they know their rights.
We protect your rights with a two-step method: (1) we copyright the
library, and (2) we offer you this license, which gives you legal
permission to copy, distribute and/or modify the library.
To protect each distributor, we want to make it very clear that
there is no warranty for the free library. Also, if the library is
modified by someone else and passed on, the recipients should know
that what they have is not the original version, so that the original
author's reputation will not be affected by problems that might be
introduced by others.
Finally, software patents pose a constant threat to the existence of
any free program. We wish to make sure that a company cannot
effectively restrict the users of a free program by obtaining a
restrictive license from a patent holder. Therefore, we insist that
any patent license obtained for a version of the library must be
consistent with the full freedom of use specified in this license.
Most GNU software, including some libraries, is covered by the
ordinary GNU General Public License. This license, the GNU Lesser
General Public License, applies to certain designated libraries, and
is quite different from the ordinary General Public License. We use
this license for certain libraries in order to permit linking those
libraries into non-free programs.
When a program is linked with a library, whether statically or using
a shared library, the combination of the two is legally speaking a
combined work, a derivative of the original library. The ordinary
General Public License therefore permits such linking only if the
entire combination fits its criteria of freedom. The Lesser General
Public License permits more lax criteria for linking other code with
the library.
We call this license the "Lesser" General Public License because it
does Less to protect the user's freedom than the ordinary General
Public License. It also provides other free software developers Less
of an advantage over competing non-free programs. These disadvantages
are the reason we use the ordinary General Public License for many
libraries. However, the Lesser license provides advantages in certain
special circumstances.
For example, on rare occasions, there may be a special need to
encourage the widest possible use of a certain library, so that it becomes
a de-facto standard. To achieve this, non-free programs must be
allowed to use the library. A more frequent case is that a free
library does the same job as widely used non-free libraries. In this
case, there is little to gain by limiting the free library to free
software only, so we use the Lesser General Public License.
In other cases, permission to use a particular library in non-free
programs enables a greater number of people to use a large body of
free software. For example, permission to use the GNU C Library in
non-free programs enables many more people to use the whole GNU
operating system, as well as its variant, the GNU/Linux operating
system.
Although the Lesser General Public License is Less protective of the
users' freedom, it does ensure that the user of a program that is
linked with the Library has the freedom and the wherewithal to run
that program using a modified version of the Library.
The precise terms and conditions for copying, distribution and
modification follow. Pay close attention to the difference between a
"work based on the library" and a "work that uses the library". The
former contains code derived from the library, whereas the latter must
be combined with the library in order to run.
GNU LESSER GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License Agreement applies to any software library or other
program which contains a notice placed by the copyright holder or
other authorized party saying it may be distributed under the terms of
this Lesser General Public License (also called "this License").
Each licensee is addressed as "you".
A "library" means a collection of software functions and/or data
prepared so as to be conveniently linked with application programs
(which use some of those functions and data) to form executables.
The "Library", below, refers to any such software library or work
which has been distributed under these terms. A "work based on the
Library" means either the Library or any derivative work under
copyright law: that is to say, a work containing the Library or a
portion of it, either verbatim or with modifications and/or translated
straightforwardly into another language. (Hereinafter, translation is
included without limitation in the term "modification".)
"Source code" for a work means the preferred form of the work for
making modifications to it. For a library, complete source code means
all the source code for all modules it contains, plus any associated
interface definition files, plus the scripts used to control compilation
and installation of the library.
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running a program using the Library is not restricted, and output from
such a program is covered only if its contents constitute a work based
on the Library (independent of the use of the Library in a tool for
writing it). Whether that is true depends on what the Library does
and what the program that uses the Library does.
1. You may copy and distribute verbatim copies of the Library's
complete source code as you receive it, in any medium, provided that
you conspicuously and appropriately publish on each copy an
appropriate copyright notice and disclaimer of warranty; keep intact
all the notices that refer to this License and to the absence of any
warranty; and distribute a copy of this License along with the
Library.
You may charge a fee for the physical act of transferring a copy,
and you may at your option offer warranty protection in exchange for a
fee.
2. You may modify your copy or copies of the Library or any portion
of it, thus forming a work based on the Library, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) The modified work must itself be a software library.
b) You must cause the files modified to carry prominent notices
stating that you changed the files and the date of any change.
c) You must cause the whole of the work to be licensed at no
charge to all third parties under the terms of this License.
d) If a facility in the modified Library refers to a function or a
table of data to be supplied by an application program that uses
the facility, other than as an argument passed when the facility
is invoked, then you must make a good faith effort to ensure that,
in the event an application does not supply such function or
table, the facility still operates, and performs whatever part of
its purpose remains meaningful.
(For example, a function in a library to compute square roots has
a purpose that is entirely well-defined independent of the
application. Therefore, Subsection 2d requires that any
application-supplied function or table used by this function must
be optional: if the application does not supply it, the square
root function must still compute square roots.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Library,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Library, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote
it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Library.
In addition, mere aggregation of another work not based on the Library
with the Library (or with a work based on the Library) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may opt to apply the terms of the ordinary GNU General Public
License instead of this License to a given copy of the Library. To do
this, you must alter all the notices that refer to this License, so
that they refer to the ordinary GNU General Public License, version 2,
instead of to this License. (If a newer version than version 2 of the
ordinary GNU General Public License has appeared, then you can specify
that version instead if you wish.) Do not make any other change in
these notices.
Once this change is made in a given copy, it is irreversible for
that copy, so the ordinary GNU General Public License applies to all
subsequent copies and derivative works made from that copy.
This option is useful when you wish to copy part of the code of
the Library into a program that is not a library.
4. You may copy and distribute the Library (or a portion or
derivative of it, under Section 2) in object code or executable form
under the terms of Sections 1 and 2 above provided that you accompany
it with the complete corresponding machine-readable source code, which
must be distributed under the terms of Sections 1 and 2 above on a
medium customarily used for software interchange.
If distribution of object code is made by offering access to copy
from a designated place, then offering equivalent access to copy the
source code from the same place satisfies the requirement to
distribute the source code, even though third parties are not
compelled to copy the source along with the object code.
5. A program that contains no derivative of any portion of the
Library, but is designed to work with the Library by being compiled or
linked with it, is called a "work that uses the Library". Such a
work, in isolation, is not a derivative work of the Library, and
therefore falls outside the scope of this License.
However, linking a "work that uses the Library" with the Library
creates an executable that is a derivative of the Library (because it
contains portions of the Library), rather than a "work that uses the
library". The executable is therefore covered by this License.
Section 6 states terms for distribution of such executables.
When a "work that uses the Library" uses material from a header file
that is part of the Library, the object code for the work may be a
derivative work of the Library even though the source code is not.
Whether this is true is especially significant if the work can be
linked without the Library, or if the work is itself a library. The
threshold for this to be true is not precisely defined by law.
If such an object file uses only numerical parameters, data
structure layouts and accessors, and small macros and small inline
functions (ten lines or less in length), then the use of the object
file is unrestricted, regardless of whether it is legally a derivative
work. (Executables containing this object code plus portions of the
Library will still fall under Section 6.)
Otherwise, if the work is a derivative of the Library, you may
distribute the object code for the work under the terms of Section 6.
Any executables containing that work also fall under Section 6,
whether or not they are linked directly with the Library itself.
6. As an exception to the Sections above, you may also combine or
link a "work that uses the Library" with the Library to produce a
work containing portions of the Library, and distribute that work
under terms of your choice, provided that the terms permit
modification of the work for the customer's own use and reverse
engineering for debugging such modifications.
You must give prominent notice with each copy of the work that the
Library is used in it and that the Library and its use are covered by
this License. You must supply a copy of this License. If the work
during execution displays copyright notices, you must include the
copyright notice for the Library among them, as well as a reference
directing the user to the copy of this License. Also, you must do one
of these things:
a) Accompany the work with the complete corresponding
machine-readable source code for the Library including whatever
changes were used in the work (which must be distributed under
Sections 1 and 2 above); and, if the work is an executable linked
with the Library, with the complete machine-readable "work that
uses the Library", as object code and/or source code, so that the
user can modify the Library and then relink to produce a modified
executable containing the modified Library. (It is understood
that the user who changes the contents of definitions files in the
Library will not necessarily be able to recompile the application
to use the modified definitions.)
b) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (1) uses at run time a
copy of the library already present on the user's computer system,
rather than copying library functions into the executable, and (2)
will operate properly with a modified version of the library, if
the user installs one, as long as the modified version is
interface-compatible with the version that the work was made with.
c) Accompany the work with a written offer, valid for at
least three years, to give the same user the materials
specified in Subsection 6a, above, for a charge no more
than the cost of performing this distribution.
d) If distribution of the work is made by offering access to copy
from a designated place, offer equivalent access to copy the above
specified materials from the same place.
e) Verify that the user has already received a copy of these
materials or that you have already sent this user a copy.
For an executable, the required form of the "work that uses the
Library" must include any data and utility programs needed for
reproducing the executable from it. However, as a special exception,
the materials to be distributed need not include anything that is
normally distributed (in either source or binary form) with the major
components (compiler, kernel, and so on) of the operating system on
which the executable runs, unless that component itself accompanies
the executable.
It may happen that this requirement contradicts the license
restrictions of other proprietary libraries that do not normally
accompany the operating system. Such a contradiction means you cannot
use both them and the Library together in an executable that you
distribute.
7. You may place library facilities that are a work based on the
Library side-by-side in a single library together with other library
facilities not covered by this License, and distribute such a combined
library, provided that the separate distribution of the work based on
the Library and of the other library facilities is otherwise
permitted, and provided that you do these two things:
a) Accompany the combined library with a copy of the same work
based on the Library, uncombined with any other library
facilities. This must be distributed under the terms of the
Sections above.
b) Give prominent notice with the combined library of the fact
that part of it is a work based on the Library, and explaining
where to find the accompanying uncombined form of the same work.
8. You may not copy, modify, sublicense, link with, or distribute
the Library except as expressly provided under this License. Any
attempt otherwise to copy, modify, sublicense, link with, or
distribute the Library is void, and will automatically terminate your
rights under this License. However, parties who have received copies,
or rights, from you under this License will not have their licenses
terminated so long as such parties remain in full compliance.
9. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Library or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Library (or any work based on the
Library), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Library or works based on it.
10. Each time you redistribute the Library (or any work based on the
Library), the recipient automatically receives a license from the
original licensor to copy, distribute, link with or modify the Library
subject to these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties with
this License.
11. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Library at all. For example, if a patent
license would not permit royalty-free redistribution of the Library by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Library.
If any portion of this section is held invalid or unenforceable under any
particular circumstance, the balance of the section is intended to apply,
and the section as a whole is intended to apply in other circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
12. If the distribution and/or use of the Library is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Library under this License may add
an explicit geographical distribution limitation excluding those countries,
so that distribution is permitted only in or among countries not thus
excluded. In such case, this License incorporates the limitation as if
written in the body of this License.
13. The Free Software Foundation may publish revised and/or new
versions of the Lesser General Public License from time to time.
Such new versions will be similar in spirit to the present version,
but may differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the Library
specifies a version number of this License which applies to it and
"any later version", you have the option of following the terms and
conditions either of that version or of any later version published by
the Free Software Foundation. If the Library does not specify a
license version number, you may choose any version ever published by
the Free Software Foundation.
14. If you wish to incorporate parts of the Library into other free
programs whose distribution conditions are incompatible with these,
write to the author to ask for permission. For software which is
copyrighted by the Free Software Foundation, write to the Free
Software Foundation; we sometimes make exceptions for this. Our
decision will be guided by the two goals of preserving the free status
of all derivatives of our free software and of promoting the sharing
and reuse of software generally.
NO WARRANTY
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Libraries
If you develop a new library, and you want it to be of the greatest
possible use to the public, we recommend making it free software that
everyone can redistribute and change. You can do so by permitting
redistribution under these terms (or, alternatively, under the terms of the
ordinary General Public License).
To apply these terms, attach the following notices to the library. It is
safest to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least the
"copyright" line and a pointer to where the full notice is found.
<one line to give the library's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Also add information on how to contact you by electronic and paper mail.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the library, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the
library `Frob' (a library for tweaking knobs) written by James Random Hacker.
<signature of Ty Coon>, 1 April 1990
Ty Coon, President of Vice
That's all there is to it!

View File

@ -1,42 +1,32 @@
The build script is to be fixed.
Right now it assumes that 32-bit MinGW is in the system path and
64-bit mingw is installed to c:\Apps\MinGW64.
* On Linux and other Unix flavors OpenCV uses default or user-built ffmpeg/libav libraries.
If a user builds ffmpeg/libav from source and wants OpenCV to stay a BSD library, not GPL/LGPL,
he/she should use the --enable-shared configure flag and make sure that no GPL components are
enabled (some notable examples are x264 (H264 encoder) and libac3 (Dolby AC3 audio codec)).
See https://www.ffmpeg.org/legal.html for details.
If you want to play it very safe and do not want to use FFMPEG at all, regardless of whether it's installed on
your system or not, configure and build OpenCV using CMake with the WITH_FFMPEG=OFF flag. OpenCV will then use
AVFoundation (OSX), GStreamer (Linux) or other available backends supported by the opencv_videoio module.
There is also our self-contained motion jpeg codec, which you can use without any worries.
It handles CV_FOURCC('M', 'J', 'P', 'G') streams within an AVI container (".avi").
* On Windows OpenCV uses pre-built ffmpeg binaries, built with proper flags (without GPL components) and
wrapped with simple, stable OpenCV-compatible API.
The binaries are opencv_ffmpeg.dll (version for 32-bit Windows) and
opencv_ffmpeg_64.dll (version for 64-bit Windows).
See build_win32.txt for the build instructions, if you want to rebuild opencv_ffmpeg*.dll from scratch.
It is important that gcc is used, not g++!
Otherwise the produced DLL will likely be dependent on libgcc_s_dw2-1.dll or similar DLL.
While we want to make the DLLs with minimum dependencies: Win32 libraries + msvcrt.dll.
The pre-built opencv_ffmpeg*.dll is:
* an LGPL library, not a BSD one;
* loaded at runtime by the opencv_videoio module.
If loading succeeds, ffmpeg can be used to decode/encode videos;
otherwise, another API is used.
ffopencv.c is really a C++ source, hence -x c++ is used.
How to update opencv_ffmpeg.dll and opencv_ffmpeg_64.dll when a new version of FFMPEG is release?
1. Install 32-bit MinGW + MSYS from
http://sourceforge.net/projects/mingw/files/Automated%20MinGW%20Installer/mingw-get-inst/
Let's assume, it's installed in C:\MSYS32.
2. Install 64-bit MinGW. http://mingw-w64.sourceforge.net/
Let's assume, it's installed in C:\MSYS64
3. Copy C:\MSYS32\msys to C:\MSYS64\msys. Edit C:\MSYS64\msys\etc\fstab, change C:\MSYS32 to C:\MSYS64.
4. Now you have working MSYS32 and MSYS64 environments.
Launch, one by one, C:\MSYS32\msys\msys.bat and C:\MSYS64\msys\msys.bat to create your home directories.
4. Download ffmpeg-x.y.z.tar.gz (where x.y.z denotes the actual ffmpeg version).
Copy it to C:\MSYS{32|64}\msys\home\<loginname> directory.
5. To build 32-bit ffmpeg libraries, run C:\MSYS32\msys\msys.bat and type the following commands:
5.1. tar -xzf ffmpeg-x.y.z.tar.gz
5.2. mkdir build
5.3. cd build
5.4. ../ffmpeg-x.y.z/configure --enable-w32threads
5.5. make
5.6. make install
5.7. cd /local/lib
5.8. strip -g *.a
6. Then repeat the same for 64-bit case. The output libs: libavcodec.a etc. need to be renamed to libavcodec64.a etc.
7. Then, copy all those libs to <opencv>\3rdparty\lib\, copy the headers to <opencv>\3rdparty\include\ffmpeg_.
8. Then, go to <opencv>\3rdparty\ffmpeg, edit make.bat
(change paths to the actual paths to your msys32 and msys64 distributions) and then run make.bat
If LGPL/GPL software cannot be supplied with your OpenCV-based product, simply exclude
opencv_ffmpeg*.dll from your distribution; OpenCV will stay fully functional except for the ability to
decode/encode videos using FFMPEG (though it may still be able to do that using another API,
such as Video for Windows, Windows Media Foundation or our self-contained motion jpeg codec).
See license.txt for the FFMPEG copyright notice and the licensing terms.
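
The note above that opencv_ffmpeg*.dll is loaded at runtime refers to plain Win32 dynamic loading rather than linking against an import library. A minimal sketch of that pattern follows; it is only an illustration, and the exported symbol name used here is an assumption, not necessarily the exact export that opencv_videoio looks up.

// Illustration of runtime DLL probing on Win32 (not OpenCV's actual loader).
// "cvCreateFileCapture_FFMPEG" is an assumed export name for the example.
#include <windows.h>
#include <cstdio>

typedef void* (*CreateCaptureFn)(const char* filename);

int main()
{
    HMODULE ffmpegDll = LoadLibraryA("opencv_ffmpeg.dll");
    if (!ffmpegDll)
    {
        std::printf("FFMPEG wrapper DLL not found; another backend would be used\n");
        return 0;
    }
    CreateCaptureFn createCapture =
        (CreateCaptureFn)GetProcAddress(ffmpegDll, "cvCreateFileCapture_FFMPEG");
    std::printf("FFMPEG backend %s\n", createCapture ? "available" : "missing expected symbol");
    FreeLibrary(ffmpegDll);
    return 0;
}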

View File

@ -842,7 +842,6 @@ static int jas_cmshapmat_apply(jas_cmpxform_t *pxform, jas_cmreal_t *in,
*dst++ = a2;
}
} else {
assert(0);
while (--cnt >= 0) {
a0 = *src++;
src++;

View File

@ -345,6 +345,7 @@ jas_stream_t *jas_stream_tmpfile()
{
jas_stream_t *stream;
jas_stream_fileobj_t *obj;
char *tmpname;
if (!(stream = jas_stream_create())) {
return 0;
@ -365,10 +366,12 @@ jas_stream_t *jas_stream_tmpfile()
#ifdef _WIN32
/* Choose a file name. */
tmpnam(obj->pathname);
tmpname = tempnam(NULL, NULL);
strcpy(obj->pathname, tmpname);
free(tmpname);
/* Open the underlying file. */
if ((obj->fd = open(obj->pathname, O_CREAT | O_EXCL | O_RDWR | O_TRUNC | O_BINARY,
if ((obj->fd = open(obj->pathname, O_CREAT | O_EXCL | O_RDWR | O_TRUNC | O_BINARY | O_TEMPORARY | _O_SHORT_LIVED,
JAS_STREAM_PERMS)) < 0) {
jas_stream_destroy(stream);
return 0;
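
The hunk above replaces tmpnam() with tempnam() and adds O_TEMPORARY | _O_SHORT_LIVED to the open flags, so the temporary file is removed automatically when its descriptor is closed. A small self-contained sketch of that Windows pattern follows; it is an illustration under assumed MSVC CRT headers, not the JasPer code itself.

/* Windows-only illustration (MSVC CRT); not the actual jas_stream.c code. */
#include <fcntl.h>      /* _O_CREAT, _O_TEMPORARY, _O_SHORT_LIVED, ... */
#include <io.h>         /* _open */
#include <sys/stat.h>   /* _S_IREAD, _S_IWRITE */
#include <stdio.h>      /* _tempnam */
#include <stdlib.h>     /* free */
#include <string.h>     /* strncpy */

int open_short_lived_tmpfile(char *pathname, size_t pathname_size)
{
    /* Ask the CRT for a unique name (caller must free it), as the patch
       does with tempnam(NULL, NULL). */
    char *tmpname = _tempnam(NULL, "jas");
    if (!tmpname)
        return -1;
    strncpy(pathname, tmpname, pathname_size - 1);
    pathname[pathname_size - 1] = '\0';
    free(tmpname);

    /* _O_TEMPORARY deletes the file when the last descriptor is closed;
       _O_SHORT_LIVED hints that it may never need to reach disk. */
    return _open(pathname, _O_CREAT | _O_EXCL | _O_RDWR | _O_TRUNC |
                           _O_BINARY | _O_TEMPORARY | _O_SHORT_LIVED,
                 _S_IREAD | _S_IWRITE);
}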

View File

@ -15,6 +15,13 @@ else()
ocv_list_filterout(lib_srcs jmemnobs.c)
endif()
if(WINRT)
add_definitions(-DNO_GETENV)
get_directory_property( DirDefs COMPILE_DEFINITIONS )
message(STATUS "Adding NO_GETENV to compiler definitions for WINRT:")
message(STATUS " COMPILE_DEFINITIONS = ${DirDefs}")
endif()
# ----------------------------------------------------------------------------------
# Define the library target:
# ----------------------------------------------------------------------------------
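
For context, NO_GETENV matters on WinRT because the sandbox provides no getenv(). The guard it enables inside libjpeg's memory manager looks roughly like the sketch below; this is an illustrative reconstruction (the JPEGMEM lookup and the default value are approximations), not the exact upstream jmemmgr.c code.

// Illustrative only: the kind of guard -DNO_GETENV compiles out.
#include <cstdlib>

static long default_mem_limit = 1000000L;   // assumed default, in bytes

long query_mem_limit()
{
#ifndef NO_GETENV
    // libjpeg consults the JPEGMEM environment variable (in thousands of
    // bytes) to size its memory pool; unavailable in the WinRT sandbox.
    if (const char *memenv = std::getenv("JPEGMEM"))
        return std::atol(memenv) * 1000L;
#endif
    return default_mem_limit;               // with NO_GETENV this is always used
}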

View File

@ -17,7 +17,7 @@ check_include_file(string.h HAVE_STRING_H)
check_include_file(sys/types.h HAVE_SYS_TYPES_H)
check_include_file(unistd.h HAVE_UNISTD_H)
if(WIN32)
if(WIN32 AND NOT WINRT)
set(USE_WIN32_FILEIO 1)
endif()
@ -79,7 +79,7 @@ set(lib_srcs
"${CMAKE_CURRENT_BINARY_DIR}/tif_config.h"
)
if(WIN32)
if(WIN32 AND NOT WINRT)
list(APPEND lib_srcs tif_win32.c)
else()
list(APPEND lib_srcs tif_unix.c)

3rdparty/readme.txt (44 lines changed)

@ -6,41 +6,34 @@ In order to use these versions of libraries instead of system ones on UNIX syste
should use BUILD_<library_name> CMake flags (for example, BUILD_PNG for the libpng library).
------------------------------------------------------------------------------------
libjpeg 8d (8.4) - The Independent JPEG Group's JPEG software.
libjpeg The Independent JPEG Group's JPEG software.
Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
See IJG home page http://www.ijg.org
for details and links to the source code
HAVE_JPEG preprocessor flag must be set to make imgcodecs use libjpeg.
On UNIX systems configure script takes care of it.
WITH_JPEG CMake option must be ON to add libjpeg support to imgcodecs.
------------------------------------------------------------------------------------
libpng 1.5.12 - Portable Network Graphics library.
libpng Portable Network Graphics library.
Copyright (c) 2004, 2006-2012 Glenn Randers-Pehrson.
See libpng home page http://www.libpng.org
for details and links to the source code
HAVE_PNG preprocessor flag must be set to make imgcodecs use libpng.
On UNIX systems configure script takes care of it.
WITH_PNG CMake option must be ON to add libpng support to imgcodecs.
------------------------------------------------------------------------------------
libtiff 4.0.2 - Tag Image File Format (TIFF) Software
libtiff Tag Image File Format (TIFF) Software
Copyright (c) 1988-1997 Sam Leffler
Copyright (c) 1991-1997 Silicon Graphics, Inc.
See libtiff home page http://www.remotesensing.org/libtiff/
for details and links to the source code
HAVE_TIFF preprocessor flag must be set to make imgcodecs use libtiff.
On UNIX systems configure script takes care of it.
In this build support for ZIP (LZ77 compression) is turned on.
WITH_TIFF CMake option must be ON to add libtiff & zlib support to imgcodecs.
------------------------------------------------------------------------------------
zlib 1.2.7 - General purpose LZ77 compression library
zlib General purpose LZ77 compression library
Copyright (C) 1995-2012 Jean-loup Gailly and Mark Adler.
See zlib home page http://www.zlib.net
for details and links to the source code
No preprocessor definition is needed to make imgcodecs use this library -
it is included automatically if either libpng or libtiff is used.
------------------------------------------------------------------------------------
jasper-1.900.1 - JasPer is a collection of software
jasper JasPer is a collection of software
(i.e., a library and application programs) for the coding
and manipulation of images. This software can handle image data in a
variety of formats. One such format supported by JasPer is the JPEG-2000
@ -50,14 +43,9 @@ jasper-1.900.1 - JasPer is a collection of software
Copyright (c) 1999-2000 The University of British Columbia
Copyright (c) 2001-2003 Michael David Adams
The JasPer license can be found in src/libjasper.
OpenCV on Windows uses pre-built libjasper library
(lib/libjasper*). To get the latest source code,
please, visit the project homepage:
http://www.ece.uvic.ca/~mdadams/jasper/
The JasPer license can be found in libjasper.
------------------------------------------------------------------------------------
openexr-1.7.1 - OpenEXR is a high dynamic-range (HDR) image file format developed
openexr OpenEXR is a high dynamic-range (HDR) image file format developed
by Industrial Light & Magic for use in computer imaging applications.
Copyright (c) 2006, Industrial Light & Magic, a division of Lucasfilm
@ -66,11 +54,17 @@ openexr-1.7.1 - OpenEXR is a high dynamic-range (HDR) image file format de
The project homepage: http://www.openexr.com
------------------------------------------------------------------------------------
ffmpeg-0.8.0 - FFmpeg is a complete, cross-platform solution to record,
ffmpeg FFmpeg is a complete, cross-platform solution to record,
convert and stream audio and video. It includes libavcodec -
the leading audio/video codec library, and also libavformat, libavutils and
other helper libraries that are used by OpenCV (in highgui module) to
other helper libraries that are used by OpenCV (in videoio module) to
read and write video files.
The project homepage: http://ffmpeg.org/
Copyright (c) 2001 Fabrice Bellard
The project homepage: http://ffmpeg.org/.
* On Linux/OSX we link user-installed ffmpeg (or ffmpeg fork libav).
* On Windows we use pre-built ffmpeg binaries,
see opencv/3rdparty/ffmpeg/readme.txt for details and licensing information
------------------------------------------------------------------------------------

View File

@ -56,6 +56,11 @@ if(POLICY CMP0026)
cmake_policy(SET CMP0026 OLD)
endif()
if (POLICY CMP0042)
# silence cmake 3.0+ warnings about MACOSX_RPATH
cmake_policy(SET CMP0042 OLD)
endif()
# must go before the project command
set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "Configs" FORCE)
if(DEFINED CMAKE_BUILD_TYPE)
@ -926,6 +931,7 @@ if(DEFINED WITH_FFMPEG)
status(" format:" HAVE_FFMPEG_FORMAT THEN "YES (ver ${ALIASOF_libavformat_VERSION})" ELSE NO)
status(" util:" HAVE_FFMPEG_UTIL THEN "YES (ver ${ALIASOF_libavutil_VERSION})" ELSE NO)
status(" swscale:" HAVE_FFMPEG_SWSCALE THEN "YES (ver ${ALIASOF_libswscale_VERSION})" ELSE NO)
status(" resample:" HAVE_FFMPEG_RESAMPLE THEN "YES (ver ${ALIASOF_libavresample_VERSION})" ELSE NO)
status(" gentoo-style:" HAVE_GENTOO_FFMPEG THEN YES ELSE NO)
endif(DEFINED WITH_FFMPEG)

View File

@ -7,6 +7,14 @@ copy or use the software.
For Open Source Computer Vision Library
(3-clause BSD License)
Copyright (C) 2000-2015, Intel Corporation, all rights reserved.
Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
Copyright (C) 2009-2015, NVIDIA Corporation, all rights reserved.
Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
Copyright (C) 2015, OpenCV Foundation, all rights reserved.
Copyright (C) 2015, Itseez Inc., all rights reserved.
Third party copyrights are property of their respective owners.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

View File

@ -9,7 +9,7 @@ project(annotation)
set(the_target opencv_annotation)
ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
ocv_target_include_modules(${the_target} ${OPENCV_ANNOTATION_DEPS})
ocv_target_include_modules_recurse(${the_target} ${OPENCV_ANNOTATION_DEPS})
file(GLOB SRCS *.cpp)

View File

@ -9,7 +9,7 @@ project(createsamples)
set(the_target opencv_createsamples)
ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
ocv_target_include_modules(${the_target} ${OPENCV_CREATESAMPLES_DEPS})
ocv_target_include_modules_recurse(${the_target} ${OPENCV_CREATESAMPLES_DEPS})
file(GLOB SRCS *.cpp)
file(GLOB HDRS *.h*)

View File

@ -9,7 +9,7 @@ project(traincascade)
set(the_target opencv_traincascade)
ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
ocv_target_include_modules(${the_target} ${OPENCV_TRAINCASCADE_DEPS})
ocv_target_include_modules_recurse(${the_target} ${OPENCV_TRAINCASCADE_DEPS})
file(GLOB SRCS *.cpp)
file(GLOB HDRS *.h*)

View File

@ -437,7 +437,7 @@ CvDTreeNode* CvCascadeBoostTrainData::subsample_data( const CvMat* _subsample_id
if (is_buf_16u)
{
unsigned short* udst_idx = (unsigned short*)(buf->data.s + root->buf_idx*get_length_subbuf() +
vi*sample_count + data_root->offset);
(size_t)vi*sample_count + data_root->offset);
for( int i = 0; i < num_valid; i++ )
{
idx = src_idx[i];
@ -450,7 +450,7 @@ CvDTreeNode* CvCascadeBoostTrainData::subsample_data( const CvMat* _subsample_id
else
{
int* idst_idx = buf->data.i + root->buf_idx*get_length_subbuf() +
vi*sample_count + root->offset;
(size_t)vi*sample_count + root->offset;
for( int i = 0; i < num_valid; i++ )
{
idx = src_idx[i];
@ -467,14 +467,14 @@ CvDTreeNode* CvCascadeBoostTrainData::subsample_data( const CvMat* _subsample_id
if (is_buf_16u)
{
unsigned short* udst = (unsigned short*)(buf->data.s + root->buf_idx*get_length_subbuf() +
(workVarCount-1)*sample_count + root->offset);
(size_t)(workVarCount-1)*sample_count + root->offset);
for( int i = 0; i < count; i++ )
udst[i] = (unsigned short)src_lbls[sidx[i]];
}
else
{
int* idst = buf->data.i + root->buf_idx*get_length_subbuf() +
(workVarCount-1)*sample_count + root->offset;
(size_t)(workVarCount-1)*sample_count + root->offset;
for( int i = 0; i < count; i++ )
idst[i] = src_lbls[sidx[i]];
}
@ -484,14 +484,14 @@ CvDTreeNode* CvCascadeBoostTrainData::subsample_data( const CvMat* _subsample_id
if (is_buf_16u)
{
unsigned short* sample_idx_dst = (unsigned short*)(buf->data.s + root->buf_idx*get_length_subbuf() +
workVarCount*sample_count + root->offset);
(size_t)workVarCount*sample_count + root->offset);
for( int i = 0; i < count; i++ )
sample_idx_dst[i] = (unsigned short)sample_idx_src[sidx[i]];
}
else
{
int* sample_idx_dst = buf->data.i + root->buf_idx*get_length_subbuf() +
workVarCount*sample_count + root->offset;
(size_t)workVarCount*sample_count + root->offset;
for( int i = 0; i < count; i++ )
sample_idx_dst[i] = sample_idx_src[sidx[i]];
}
@ -677,9 +677,9 @@ void CvCascadeBoostTrainData::setData( const CvFeatureEvaluator* _featureEvaluat
// set sample labels
if (is_buf_16u)
udst = (unsigned short*)(buf->data.s + work_var_count*sample_count);
udst = (unsigned short*)(buf->data.s + (size_t)work_var_count*sample_count);
else
idst = buf->data.i + work_var_count*sample_count;
idst = buf->data.i + (size_t)work_var_count*sample_count;
for (int si = 0; si < sample_count; si++)
{
@ -747,11 +747,11 @@ void CvCascadeBoostTrainData::get_ord_var_data( CvDTreeNode* n, int vi, float* o
if ( vi < numPrecalcIdx )
{
if( !is_buf_16u )
*sortedIndices = buf->data.i + n->buf_idx*get_length_subbuf() + vi*sample_count + n->offset;
*sortedIndices = buf->data.i + n->buf_idx*get_length_subbuf() + (size_t)vi*sample_count + n->offset;
else
{
const unsigned short* shortIndices = (const unsigned short*)(buf->data.s + n->buf_idx*get_length_subbuf() +
vi*sample_count + n->offset );
(size_t)vi*sample_count + n->offset );
for( int i = 0; i < nodeSampleCount; i++ )
sortedIndicesBuf[i] = shortIndices[i];
@ -862,14 +862,14 @@ struct FeatureIdxOnlyPrecalc : ParallelLoopBody
{
valCachePtr[si] = (*featureEvaluator)( fi, si );
if ( is_buf_16u )
*(udst + fi*sample_count + si) = (unsigned short)si;
*(udst + (size_t)fi*sample_count + si) = (unsigned short)si;
else
*(idst + fi*sample_count + si) = si;
*(idst + (size_t)fi*sample_count + si) = si;
}
if ( is_buf_16u )
std::sort(udst + fi*sample_count, udst + (fi + 1)*sample_count, LessThanIdx<float, unsigned short>(valCachePtr) );
std::sort(udst + (size_t)fi*sample_count, udst + (size_t)(fi + 1)*sample_count, LessThanIdx<float, unsigned short>(valCachePtr) );
else
std::sort(idst + fi*sample_count, idst + (fi + 1)*sample_count, LessThanIdx<float, int>(valCachePtr) );
std::sort(idst + (size_t)fi*sample_count, idst + (size_t)(fi + 1)*sample_count, LessThanIdx<float, int>(valCachePtr) );
}
}
const CvFeatureEvaluator* featureEvaluator;
@ -898,14 +898,14 @@ struct FeatureValAndIdxPrecalc : ParallelLoopBody
{
valCache->at<float>(fi,si) = (*featureEvaluator)( fi, si );
if ( is_buf_16u )
*(udst + fi*sample_count + si) = (unsigned short)si;
*(udst + (size_t)fi*sample_count + si) = (unsigned short)si;
else
*(idst + fi*sample_count + si) = si;
*(idst + (size_t)fi*sample_count + si) = si;
}
if ( is_buf_16u )
std::sort(udst + fi*sample_count, udst + (fi + 1)*sample_count, LessThanIdx<float, unsigned short>(valCache->ptr<float>(fi)) );
std::sort(udst + (size_t)fi*sample_count, udst + (size_t)(fi + 1)*sample_count, LessThanIdx<float, unsigned short>(valCache->ptr<float>(fi)) );
else
std::sort(idst + fi*sample_count, idst + (fi + 1)*sample_count, LessThanIdx<float, int>(valCache->ptr<float>(fi)) );
std::sort(idst + (size_t)fi*sample_count, idst + (size_t)(fi + 1)*sample_count, LessThanIdx<float, int>(valCache->ptr<float>(fi)) );
}
}
const CvFeatureEvaluator* featureEvaluator;
@ -1228,9 +1228,9 @@ void CvCascadeBoostTree::split_node_data( CvDTreeNode* node )
if (data->is_buf_16u)
{
unsigned short *ldst = (unsigned short *)(buf->data.s + left->buf_idx*length_buf_row +
(workVarCount-1)*scount + left->offset);
(size_t)(workVarCount-1)*scount + left->offset);
unsigned short *rdst = (unsigned short *)(buf->data.s + right->buf_idx*length_buf_row +
(workVarCount-1)*scount + right->offset);
(size_t)(workVarCount-1)*scount + right->offset);
for( int i = 0; i < n; i++ )
{
@ -1251,9 +1251,9 @@ void CvCascadeBoostTree::split_node_data( CvDTreeNode* node )
else
{
int *ldst = buf->data.i + left->buf_idx*length_buf_row +
(workVarCount-1)*scount + left->offset;
(size_t)(workVarCount-1)*scount + left->offset;
int *rdst = buf->data.i + right->buf_idx*length_buf_row +
(workVarCount-1)*scount + right->offset;
(size_t)(workVarCount-1)*scount + right->offset;
for( int i = 0; i < n; i++ )
{
@ -1281,9 +1281,9 @@ void CvCascadeBoostTree::split_node_data( CvDTreeNode* node )
if (data->is_buf_16u)
{
unsigned short* ldst = (unsigned short*)(buf->data.s + left->buf_idx*length_buf_row +
workVarCount*scount + left->offset);
(size_t)workVarCount*scount + left->offset);
unsigned short* rdst = (unsigned short*)(buf->data.s + right->buf_idx*length_buf_row +
workVarCount*scount + right->offset);
(size_t)workVarCount*scount + right->offset);
for (int i = 0; i < n; i++)
{
unsigned short idx = (unsigned short)tempBuf[i];
@ -1302,9 +1302,9 @@ void CvCascadeBoostTree::split_node_data( CvDTreeNode* node )
else
{
int* ldst = buf->data.i + left->buf_idx*length_buf_row +
workVarCount*scount + left->offset;
(size_t)workVarCount*scount + left->offset;
int* rdst = buf->data.i + right->buf_idx*length_buf_row +
workVarCount*scount + right->offset;
(size_t)workVarCount*scount + right->offset;
for (int i = 0; i < n; i++)
{
int idx = tempBuf[i];
@ -1473,7 +1473,7 @@ void CvCascadeBoost::update_weights( CvBoostTree* tree )
if (data->is_buf_16u)
{
unsigned short* labels = (unsigned short*)(buf->data.s + data->data_root->buf_idx*length_buf_row +
data->data_root->offset + (data->work_var_count-1)*data->sample_count);
data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count);
for( int i = 0; i < n; i++ )
{
// save original categorical responses {0,1}, convert them to {-1,1}
@ -1491,7 +1491,7 @@ void CvCascadeBoost::update_weights( CvBoostTree* tree )
else
{
int* labels = buf->data.i + data->data_root->buf_idx*length_buf_row +
data->data_root->offset + (data->work_var_count-1)*data->sample_count;
data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count;
for( int i = 0; i < n; i++ )
{
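
All of the (size_t) casts in these hunks (and the similar ones further down) address the same problem: offsets such as vi*sample_count were computed in 32-bit int before being added to a pointer, which can overflow with large training sets. Casting one operand to size_t performs the multiplication in the wider type. A minimal stand-alone illustration with made-up sizes:

#include <cstddef>
#include <iostream>

int main()
{
    // Hypothetical sizes: ~3000 features and 2,000,000 samples.
    std::size_t vi = 3000, sample_count = 2000000;

    // Promoted before multiplying: 6,000,000,000, correct with a 64-bit size_t.
    std::size_t good = vi * sample_count;

    // Truncated to 32 bits first, as the old int arithmetic effectively was
    // (with signed int this overflow is even undefined behaviour):
    unsigned int bad = (unsigned int)vi * (unsigned int)sample_count;

    std::cout << "wide product:   " << good << "\n"     // 6000000000
              << "32-bit product: " << bad  << "\n";    // 1705032704
    return 0;
}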

View File

@ -135,7 +135,8 @@ bool CvCascadeClassifier::train( const string _cascadeDirName,
const CvCascadeParams& _cascadeParams,
const CvFeatureParams& _featureParams,
const CvCascadeBoostParams& _stageParams,
bool baseFormatSave )
bool baseFormatSave,
double acceptanceRatioBreakValue )
{
// Start recording clock ticks for training time output
const clock_t begin_time = clock();
@ -185,6 +186,7 @@ bool CvCascadeClassifier::train( const string _cascadeDirName,
cout << "numStages: " << numStages << endl;
cout << "precalcValBufSize[Mb] : " << _precalcValBufSize << endl;
cout << "precalcIdxBufSize[Mb] : " << _precalcIdxBufSize << endl;
cout << "acceptanceRatioBreakValue : " << acceptanceRatioBreakValue << endl;
cascadeParams.printAttrs();
stageParams->printAttrs();
featureParams->printAttrs();
@ -207,13 +209,18 @@ bool CvCascadeClassifier::train( const string _cascadeDirName,
if ( !updateTrainingSet( tempLeafFARate ) )
{
cout << "Train dataset for temp stage can not be filled. "
"Branch training terminated." << endl;
"Branch training terminated." << endl;
break;
}
if( tempLeafFARate <= requiredLeafFARate )
{
cout << "Required leaf false alarm rate achieved. "
"Branch training terminated." << endl;
"Branch training terminated." << endl;
break;
}
if( (tempLeafFARate <= acceptanceRatioBreakValue) && (acceptanceRatioBreakValue >= 0) ){
cout << "The required acceptanceRatio for the model has been reached to avoid overfitting of trainingdata. "
"Branch training terminated." << endl;
break;
}
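
The new acceptanceRatioBreakValue parameter adds a second stopping rule: training also terminates once the stage acceptance ratio falls to or below the given value, while the default of -1.0 keeps the previous behaviour. A distilled stand-alone sketch of that condition (not the class itself):

#include <iostream>

// Mirrors the new early-stop test added to CvCascadeClassifier::train():
// a negative break value (the default -1.0) disables the check.
static bool stopForAcceptanceRatio(double tempLeafFARate,
                                   double acceptanceRatioBreakValue)
{
    return acceptanceRatioBreakValue >= 0 &&
           tempLeafFARate <= acceptanceRatioBreakValue;
}

int main()
{
    std::cout << stopForAcceptanceRatio(1e-5, -1.0) << "\n";  // 0: check disabled
    std::cout << stopForAcceptanceRatio(1e-5, 1e-4) << "\n";  // 1: stop, ratio low enough
    std::cout << stopForAcceptanceRatio(1e-3, 1e-4) << "\n";  // 0: keep training
    return 0;
}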

View File

@ -94,7 +94,8 @@ public:
const CvCascadeParams& _cascadeParams,
const CvFeatureParams& _featureParams,
const CvCascadeBoostParams& _stageParams,
bool baseFormatSave = false );
bool baseFormatSave = false,
double acceptanceRatioBreakValue = -1.0 );
private:
int predict( int sampleIdx );
void save( const std::string cascadeDirName, bool baseFormat = false );

View File

@ -1200,7 +1200,7 @@ CvBoost::update_weights( CvBoostTree* tree )
if (data->is_buf_16u)
{
unsigned short* labels = (unsigned short*)(dtree_data_buf->data.s + data->data_root->buf_idx*length_buf_row +
data->data_root->offset + (data->work_var_count-1)*data->sample_count);
data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count);
for( i = 0; i < n; i++ )
{
// save original categorical responses {0,1}, convert them to {-1,1}
@ -1218,7 +1218,7 @@ CvBoost::update_weights( CvBoostTree* tree )
else
{
int* labels = dtree_data_buf->data.i + data->data_root->buf_idx*length_buf_row +
data->data_root->offset + (data->work_var_count-1)*data->sample_count;
data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count;
for( i = 0; i < n; i++ )
{

View File

@ -82,7 +82,7 @@ void CvStatModel::load( const char* filename, const char* name )
{
CvFileStorage* fs = 0;
CV_FUNCNAME( "CvStatModel::load" );
CV_FUNCNAME( "CvAlgorithm::load" );
__BEGIN__;

View File

@ -424,9 +424,9 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
int* c_map;
if (is_buf_16u)
udst = (unsigned short*)(buf->data.s + vi*sample_count);
udst = (unsigned short*)(buf->data.s + (size_t)vi*sample_count);
else
idst = buf->data.i + vi*sample_count;
idst = buf->data.i + (size_t)vi*sample_count;
// copy data
for( i = 0; i < sample_count; i++ )
@ -540,9 +540,9 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
else if( ci < 0 ) // process ordered variable
{
if (is_buf_16u)
udst = (unsigned short*)(buf->data.s + vi*sample_count);
udst = (unsigned short*)(buf->data.s + (size_t)vi*sample_count);
else
idst = buf->data.i + vi*sample_count;
idst = buf->data.i + (size_t)vi*sample_count;
for( i = 0; i < sample_count; i++ )
{
@ -583,9 +583,9 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
// set sample labels
if (is_buf_16u)
udst = (unsigned short*)(buf->data.s + work_var_count*sample_count);
udst = (unsigned short*)(buf->data.s + (size_t)work_var_count*sample_count);
else
idst = buf->data.i + work_var_count*sample_count;
idst = buf->data.i + (size_t)work_var_count*sample_count;
for (i = 0; i < sample_count; i++)
{
@ -602,7 +602,7 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
if (is_buf_16u)
{
usdst = (unsigned short*)(buf->data.s + (get_work_var_count()-1)*sample_count);
usdst = (unsigned short*)(buf->data.s + (size_t)(get_work_var_count()-1)*sample_count);
for( i = vi = 0; i < sample_count; i++ )
{
usdst[i] = (unsigned short)vi++;
@ -619,7 +619,7 @@ void CvDTreeTrainData::set_data( const CvMat* _train_data, int _tflag,
}
else
{
idst2 = buf->data.i + (get_work_var_count()-1)*sample_count;
idst2 = buf->data.i + (size_t)(get_work_var_count()-1)*sample_count;
for( i = vi = 0; i < sample_count; i++ )
{
idst2[i] = vi++;
@ -785,7 +785,7 @@ CvDTreeNode* CvDTreeTrainData::subsample_data( const CvMat* _subsample_idx )
if (is_buf_16u)
{
unsigned short* udst = (unsigned short*)(buf->data.s + root->buf_idx*get_length_subbuf() +
vi*sample_count + root->offset);
(size_t)vi*sample_count + root->offset);
for( i = 0; i < count; i++ )
{
int val = src[sidx[i]];
@ -796,7 +796,7 @@ CvDTreeNode* CvDTreeTrainData::subsample_data( const CvMat* _subsample_idx )
else
{
int* idst = buf->data.i + root->buf_idx*get_length_subbuf() +
vi*sample_count + root->offset;
(size_t)vi*sample_count + root->offset;
for( i = 0; i < count; i++ )
{
int val = src[sidx[i]];
@ -822,7 +822,7 @@ CvDTreeNode* CvDTreeTrainData::subsample_data( const CvMat* _subsample_idx )
if (is_buf_16u)
{
unsigned short* udst_idx = (unsigned short*)(buf->data.s + root->buf_idx*get_length_subbuf() +
vi*sample_count + data_root->offset);
(size_t)vi*sample_count + data_root->offset);
for( i = 0; i < num_valid; i++ )
{
idx = src_idx[i];
@ -846,7 +846,7 @@ CvDTreeNode* CvDTreeTrainData::subsample_data( const CvMat* _subsample_idx )
else
{
int* idst_idx = buf->data.i + root->buf_idx*get_length_subbuf() +
vi*sample_count + root->offset;
(size_t)vi*sample_count + root->offset;
for( i = 0; i < num_valid; i++ )
{
idx = src_idx[i];
@ -874,14 +874,14 @@ CvDTreeNode* CvDTreeTrainData::subsample_data( const CvMat* _subsample_idx )
if (is_buf_16u)
{
unsigned short* sample_idx_dst = (unsigned short*)(buf->data.s + root->buf_idx*get_length_subbuf() +
workVarCount*sample_count + root->offset);
(size_t)workVarCount*sample_count + root->offset);
for (i = 0; i < count; i++)
sample_idx_dst[i] = (unsigned short)sample_idx_src[sidx[i]];
}
else
{
int* sample_idx_dst = buf->data.i + root->buf_idx*get_length_subbuf() +
workVarCount*sample_count + root->offset;
(size_t)workVarCount*sample_count + root->offset;
for (i = 0; i < count; i++)
sample_idx_dst[i] = sample_idx_src[sidx[i]];
}
@ -1192,10 +1192,10 @@ void CvDTreeTrainData::get_ord_var_data( CvDTreeNode* n, int vi, float* ord_valu
if( !is_buf_16u )
*sorted_indices = buf->data.i + n->buf_idx*get_length_subbuf() +
vi*sample_count + n->offset;
(size_t)vi*sample_count + n->offset;
else {
const unsigned short* short_indices = (const unsigned short*)(buf->data.s + n->buf_idx*get_length_subbuf() +
vi*sample_count + n->offset );
(size_t)vi*sample_count + n->offset );
for( int i = 0; i < node_sample_count; i++ )
sorted_indices_buf[i] = short_indices[i];
*sorted_indices = sorted_indices_buf;
@ -1266,10 +1266,10 @@ const int* CvDTreeTrainData::get_cat_var_data( CvDTreeNode* n, int vi, int* cat_
const int* cat_values = 0;
if( !is_buf_16u )
cat_values = buf->data.i + n->buf_idx*get_length_subbuf() +
vi*sample_count + n->offset;
(size_t)vi*sample_count + n->offset;
else {
const unsigned short* short_values = (const unsigned short*)(buf->data.s + n->buf_idx*get_length_subbuf() +
vi*sample_count + n->offset);
(size_t)vi*sample_count + n->offset);
for( int i = 0; i < n->sample_count; i++ )
cat_values_buf[i] = short_values[i];
cat_values = cat_values_buf;

View File

@ -15,6 +15,7 @@ int main( int argc, char* argv[] )
int precalcValBufSize = 1024,
precalcIdxBufSize = 1024;
bool baseFormatSave = false;
double acceptanceRatioBreakValue = -1.0;
CvCascadeParams cascadeParams;
CvCascadeBoostParams stageParams;
@ -36,6 +37,7 @@ int main( int argc, char* argv[] )
cout << " [-precalcIdxBufSize <precalculated_idxs_buffer_size_in_Mb = " << precalcIdxBufSize << ">]" << endl;
cout << " [-baseFormatSave]" << endl;
cout << " [-numThreads <max_number_of_threads = " << numThreads << ">]" << endl;
cout << " [-acceptanceRatioBreakValue <value> = " << acceptanceRatioBreakValue << ">]" << endl;
cascadeParams.printDefaults();
stageParams.printDefaults();
for( int fi = 0; fi < fc; fi++ )
@ -86,6 +88,10 @@ int main( int argc, char* argv[] )
{
numThreads = atoi(argv[++i]);
}
else if( !strcmp( argv[i], "-acceptanceRatioBreakValue" ) )
{
acceptanceRatioBreakValue = atof(argv[++i]);
}
else if ( cascadeParams.scanAttr( argv[i], argv[i+1] ) ) { i++; }
else if ( stageParams.scanAttr( argv[i], argv[i+1] ) ) { i++; }
else if ( !set )
@ -112,6 +118,7 @@ int main( int argc, char* argv[] )
cascadeParams,
*featureParams[cascadeParams.featureType],
stageParams,
baseFormatSave );
baseFormatSave,
acceptanceRatioBreakValue );
return 0;
}

View File

@ -619,6 +619,8 @@ if(DEFINED CUDA_TARGET_CPU_ARCH)
set(_cuda_target_cpu_arch_initial "${CUDA_TARGET_CPU_ARCH}")
elseif(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|ARM)")
set(_cuda_target_cpu_arch_initial "ARM")
elseif(CUDA_VERSION VERSION_GREATER "6.5" AND CMAKE_CROSSCOMPILING AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|AARCH64)")
set(_cuda_target_cpu_arch_initial "AARCH64")
else()
set(_cuda_target_cpu_arch_initial "")
endif()
@ -643,6 +645,12 @@ elseif(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND "${CUDA_T
elseif(EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
set(_cuda_target_triplet_initial "armv7-linux-gnueabihf")
endif()
elseif(CUDA_VERSION VERSION_GREATER "6.5" AND CMAKE_CROSSCOMPILING AND "${CUDA_TARGET_CPU_ARCH}" STREQUAL "AARCH64")
if("${CUDA_TARGET_OS_VARIANT}" STREQUAL "Android" AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux-androideabi")
set(_cuda_target_triplet_initial "aarch64-linux-androideabi")
elseif(EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux-gnueabihf")
set(_cuda_target_triplet_initial "aarch64-linux-gnueabihf")
endif()
endif()
set(CUDA_TARGET_TRIPLET "${_cuda_target_triplet_initial}" CACHE STRING "Specify the target triplet for which the input files must be compiled.")
file(GLOB __cuda_available_target_tiplets RELATIVE "${CUDA_TOOLKIT_ROOT_DIR}/targets" "${CUDA_TOOLKIT_ROOT_DIR}/targets/*" )
@ -1094,8 +1102,10 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
set(nvcc_flags ${nvcc_flags} -m32)
endif()
if(CUDA_TARGET_CPU_ARCH)
set(nvcc_flags ${nvcc_flags} "--target-cpu-architecture=${CUDA_TARGET_CPU_ARCH}")
if(CUDA_TARGET_CPU_ARCH AND CUDA_VERSION VERSION_LESS "7.0")
# CPU architecture is either ARM or X86. Patch AARCH64 to be ARM
string(REPLACE "AARCH64" "ARM" CUDA_TARGET_CPU_ARCH_patched ${CUDA_TARGET_CPU_ARCH})
set(nvcc_flags ${nvcc_flags} "--target-cpu-architecture=${CUDA_TARGET_CPU_ARCH_patched}")
endif()
if(CUDA_TARGET_OS_VARIANT AND CUDA_VERSION VERSION_LESS "7.0")

View File

@ -98,6 +98,10 @@ if(CMAKE_COMPILER_IS_GNUCXX)
add_extra_compiler_option(-pthread)
endif()
if(CMAKE_COMPILER_IS_CLANGCXX)
add_extra_compiler_option(-Qunused-arguments)
endif()
if(OPENCV_WARNINGS_ARE_ERRORS)
add_extra_compiler_option(-Werror)
endif()
@ -127,6 +131,8 @@ if(CMAKE_COMPILER_IS_GNUCXX)
endif()
if(ENABLE_SSE2)
add_extra_compiler_option(-msse2)
elseif(X86 OR X86_64)
add_extra_compiler_option(-mno-sse2)
endif()
if(ENABLE_NEON)
add_extra_compiler_option("-mfpu=neon")
@ -139,6 +145,8 @@ if(CMAKE_COMPILER_IS_GNUCXX)
if(NOT MINGW)
if(ENABLE_AVX)
add_extra_compiler_option(-mavx)
elseif(X86 OR X86_64)
add_extra_compiler_option(-mno-avx)
endif()
if(ENABLE_AVX2)
add_extra_compiler_option(-mavx2)
@ -152,18 +160,26 @@ if(CMAKE_COMPILER_IS_GNUCXX)
if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx")
if(ENABLE_SSE3)
add_extra_compiler_option(-msse3)
elseif(X86 OR X86_64)
add_extra_compiler_option(-mno-sse3)
endif()
if(ENABLE_SSSE3)
add_extra_compiler_option(-mssse3)
elseif(X86 OR X86_64)
add_extra_compiler_option(-mno-ssse3)
endif()
if(ENABLE_SSE41)
add_extra_compiler_option(-msse4.1)
elseif(X86 OR X86_64)
add_extra_compiler_option(-mno-sse4.1)
endif()
if(ENABLE_SSE42)
add_extra_compiler_option(-msse4.2)
elseif(X86 OR X86_64)
add_extra_compiler_option(-mno-sse4.2)
endif()
if(ENABLE_POPCNT)
@ -265,6 +281,11 @@ if(MSVC)
endif()
endif()
if(MSVC12 AND NOT CMAKE_GENERATOR MATCHES "Visual Studio")
set(OPENCV_EXTRA_C_FLAGS "${OPENCV_EXTRA_C_FLAGS} /FS")
set(OPENCV_EXTRA_CXX_FLAGS "${OPENCV_EXTRA_CXX_FLAGS} /FS")
endif()
# Extra link libs if the user selects building static libs:
if(NOT BUILD_SHARED_LIBS AND CMAKE_COMPILER_IS_GNUCXX AND NOT ANDROID)
# Android does not need these settings because they are already set by toolchain file

View File

@ -47,7 +47,7 @@ endif()
if(NOT DEFINED OpenCV_STATIC)
# look for global setting
if(NOT DEFINED BUILD_SHARED_LIBS OR BUILD_SHARED_LIBS)
if(BUILD_SHARED_LIBS)
set(OpenCV_STATIC OFF)
else()
set(OpenCV_STATIC ON)
@ -89,7 +89,7 @@ elseif(MINGW)
execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine
OUTPUT_VARIABLE OPENCV_GCC_TARGET_MACHINE
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(CMAKE_OPENCV_GCC_TARGET_MACHINE MATCHES "64")
if(OPENCV_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64")
set(MINGW64 1)
set(OpenCV_ARCH x64)
else()

View File

@ -91,9 +91,9 @@ elseif(CMAKE_COMPILER_IS_GNUCXX)
if(WIN32)
execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine
OUTPUT_VARIABLE CMAKE_OPENCV_GCC_TARGET_MACHINE
OUTPUT_VARIABLE OPENCV_GCC_TARGET_MACHINE
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(CMAKE_OPENCV_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64")
if(OPENCV_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64")
set(MINGW64 1)
endif()
endif()
@ -114,7 +114,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
endif()
# Similar code is existed in OpenCVConfig.cmake
# Similar code exists in OpenCVConfig.cmake
if(NOT DEFINED OpenCV_STATIC)
# look for global setting
if(NOT DEFINED BUILD_SHARED_LIBS OR BUILD_SHARED_LIBS)
@ -147,11 +147,7 @@ if(MSVC)
elseif(MINGW)
set(OpenCV_RUNTIME mingw)
execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpmachine
OUTPUT_VARIABLE OPENCV_GCC_TARGET_MACHINE
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(CMAKE_OPENCV_GCC_TARGET_MACHINE MATCHES "64")
set(MINGW64 1)
if(MINGW64)
set(OpenCV_ARCH x64)
else()
set(OpenCV_ARCH x86)

View File

@ -187,7 +187,7 @@ if(WITH_XIMEA)
endif(WITH_XIMEA)
# --- FFMPEG ---
ocv_clear_vars(HAVE_FFMPEG HAVE_FFMPEG_CODEC HAVE_FFMPEG_FORMAT HAVE_FFMPEG_UTIL HAVE_FFMPEG_SWSCALE HAVE_GENTOO_FFMPEG HAVE_FFMPEG_FFMPEG)
ocv_clear_vars(HAVE_FFMPEG HAVE_FFMPEG_CODEC HAVE_FFMPEG_FORMAT HAVE_FFMPEG_UTIL HAVE_FFMPEG_SWSCALE HAVE_FFMPEG_RESAMPLE HAVE_GENTOO_FFMPEG HAVE_FFMPEG_FFMPEG)
if(WITH_FFMPEG)
if(WIN32 AND NOT ARM)
include("${OpenCV_SOURCE_DIR}/3rdparty/ffmpeg/ffmpeg_version.cmake")
@ -196,6 +196,7 @@ if(WITH_FFMPEG)
CHECK_MODULE(libavformat HAVE_FFMPEG_FORMAT)
CHECK_MODULE(libavutil HAVE_FFMPEG_UTIL)
CHECK_MODULE(libswscale HAVE_FFMPEG_SWSCALE)
CHECK_MODULE(libavresample HAVE_FFMPEG_RESAMPLE)
CHECK_INCLUDE_FILE(libavformat/avformat.h HAVE_GENTOO_FFMPEG)
CHECK_INCLUDE_FILE(ffmpeg/avformat.h HAVE_FFMPEG_FFMPEG)
@ -215,42 +216,43 @@ if(WITH_FFMPEG)
# Do another trial
FIND_FILE(BZIP2_LIBRARIES NAMES libbz2.so.1 PATHS /lib)
endif()
endif(HAVE_FFMPEG)
endif()
if(APPLE)
find_path(FFMPEG_INCLUDE_DIR "libavformat/avformat.h"
PATHS /usr/local /usr /opt
PATH_SUFFIXES include
DOC "The path to FFMPEG headers")
if(FFMPEG_INCLUDE_DIR)
set(HAVE_GENTOO_FFMPEG TRUE)
set(FFMPEG_LIB_DIR "${FFMPEG_INCLUDE_DIR}/../lib" CACHE PATH "Full path of FFMPEG library directory")
if(EXISTS "${FFMPEG_LIB_DIR}/libavcodec.a")
set(HAVE_FFMPEG_CODEC 1)
set(ALIASOF_libavcodec_VERSION "Unknown")
if(EXISTS "${FFMPEG_LIB_DIR}/libavformat.a")
set(HAVE_FFMPEG_FORMAT 1)
else()
find_path(FFMPEG_INCLUDE_DIR "libavformat/avformat.h"
PATHS /usr/local /usr /opt
PATH_SUFFIXES include
DOC "The path to FFMPEG headers")
if(FFMPEG_INCLUDE_DIR)
set(HAVE_GENTOO_FFMPEG TRUE)
set(FFMPEG_LIB_DIR "${FFMPEG_INCLUDE_DIR}/../lib" CACHE PATH "Full path of FFMPEG library directory")
find_library(FFMPEG_CODEC_LIB "avcodec" HINTS "${FFMPEG_LIB_DIR}")
find_library(FFMPEG_FORMAT_LIB "avformat" HINTS "${FFMPEG_LIB_DIR}")
find_library(FFMPEG_UTIL_LIB "avutil" HINTS "${FFMPEG_LIB_DIR}")
find_library(FFMPEG_SWSCALE_LIB "swscale" HINTS "${FFMPEG_LIB_DIR}")
find_library(FFMPEG_RESAMPLE_LIB "avresample" HINTS "${FFMPEG_LIB_DIR}")
if(FFMPEG_CODEC_LIB AND FFMPEG_FORMAT_LIB AND
FFMPEG_UTIL_LIB AND FFMPEG_SWSCALE_LIB)
set(ALIASOF_libavcodec_VERSION "Unknown")
set(ALIASOF_libavformat_VERSION "Unknown")
if(EXISTS "${FFMPEG_LIB_DIR}/libavutil.a")
set(HAVE_FFMPEG_UTIL 1)
set(ALIASOF_libavutil_VERSION "Unknown")
if(EXISTS "${FFMPEG_LIB_DIR}/libswscale.a")
set(HAVE_FFMPEG_SWSCALE 1)
set(ALIASOF_libswscale_VERSION "Unknown")
set(HAVE_FFMPEG 1)
endif()
set(ALIASOF_libavutil_VERSION "Unknown")
set(ALIASOF_libswscale_VERSION "Unknown")
set(HAVE_FFMPEG 1)
if(FFMPEG_RESAMPLE_LIB)
set(HAVE_FFMPEG_RESAMPLE 1)
set(ALIASOF_libavresample_VERSION "Unknown")
endif()
endif()
endif()
endif(FFMPEG_INCLUDE_DIR)
if(HAVE_FFMPEG)
set(VIDEOIO_LIBRARIES ${VIDEOIO_LIBRARIES} "${FFMPEG_LIB_DIR}/libavcodec.a"
"${FFMPEG_LIB_DIR}/libavformat.a" "${FFMPEG_LIB_DIR}/libavutil.a"
"${FFMPEG_LIB_DIR}/libswscale.a")
ocv_include_directories(${FFMPEG_INCLUDE_DIR})
endif(FFMPEG_INCLUDE_DIR)
if(HAVE_FFMPEG)
set(VIDEOIO_LIBRARIES ${VIDEOIO_LIBRARIES} "${FFMPEG_LIB_DIR}/libavcodec.a"
"${FFMPEG_LIB_DIR}/libavformat.a" "${FFMPEG_LIB_DIR}/libavutil.a"
"${FFMPEG_LIB_DIR}/libswscale.a")
if(HAVE_FFMPEG_RESAMPLE)
set(VIDEOIO_LIBRARIES ${VIDEOIO_LIBRARIES} "${FFMPEG_LIB_DIR}/libavresample.a")
endif()
ocv_include_directories(${FFMPEG_INCLUDE_DIR})
endif(HAVE_FFMPEG)
endif()
endif(APPLE)
endif()
endif(WITH_FFMPEG)
# --- VideoInput/DirectShow ---

View File

@ -43,34 +43,17 @@ if(ANDROID)
endforeach()
# build the list of opencv libs and dependencies for all modules
set(OPENCV_MODULES_CONFIGMAKE "")
set(OPENCV_EXTRA_COMPONENTS_CONFIGMAKE "")
set(OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE "")
foreach(m ${OPENCV_MODULES_PUBLIC})
list(INSERT OPENCV_MODULES_CONFIGMAKE 0 ${${m}_MODULE_DEPS_${ocv_optkind}} ${m})
if(${m}_EXTRA_DEPS_${ocv_optkind})
list(INSERT OPENCV_EXTRA_COMPONENTS_CONFIGMAKE 0 ${${m}_EXTRA_DEPS_${ocv_optkind}})
endif()
endforeach()
ocv_get_all_libs(OPENCV_MODULES_CONFIGMAKE OPENCV_EXTRA_COMPONENTS_CONFIGMAKE OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE)
# split 3rdparty libs and modules
foreach(mod ${OPENCV_MODULES_CONFIGMAKE})
if(NOT mod MATCHES "^opencv_.+$")
list(INSERT OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE 0 ${mod})
endif()
endforeach()
if(OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE)
list(REMOVE_ITEM OPENCV_MODULES_CONFIGMAKE ${OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE})
endif()
# list -> string
string(REPLACE ";" " " OPENCV_MODULES_CONFIGMAKE "${OPENCV_MODULES_CONFIGMAKE}")
string(REPLACE ";" " " OPENCV_EXTRA_COMPONENTS_CONFIGMAKE "${OPENCV_EXTRA_COMPONENTS_CONFIGMAKE}")
string(REPLACE ";" " " OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE "${OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE}")
# convert CMake lists to makefile literals
foreach(lst OPENCV_MODULES_CONFIGMAKE OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE OPENCV_EXTRA_COMPONENTS_CONFIGMAKE)
ocv_list_unique(${lst})
ocv_list_reverse(${lst})
string(REPLACE ";" " " ${lst} "${${lst}}")
endforeach()
# replace 'opencv_<module>' -> '<module>''
string(REPLACE "opencv_" "" OPENCV_MODULES_CONFIGMAKE "${OPENCV_MODULES_CONFIGMAKE}")
# prepare 3rd-party component list without TBB for armeabi and mips platforms. TBB is useless there.
set(OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE_NO_TBB ${OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE})
foreach(mod ${OPENCV_3RDPARTY_COMPONENTS_CONFIGMAKE_NO_TBB})

View File

@ -176,6 +176,11 @@ macro(ocv_add_module _name)
endif()
endif()
# add HAL as dependency
if(NOT "${the_module}" STREQUAL "opencv_hal")
ocv_add_dependencies(${the_module} opencv_hal)
endif()
# add self to the world dependencies
if((NOT DEFINED OPENCV_MODULE_IS_PART_OF_WORLD
AND NOT OPENCV_MODULE_${the_module}_CLASS STREQUAL "BINDINGS"
@ -517,6 +522,18 @@ macro(ocv_include_modules)
endforeach()
endmacro()
# same as previous but with dependencies
macro(ocv_include_modules_recurse)
ocv_include_modules(${ARGN})
foreach(d ${ARGN})
if(d MATCHES "^opencv_" AND HAVE_${d} AND DEFINED OPENCV_MODULE_${d}_DEPS)
foreach (sub ${OPENCV_MODULE_${d}_DEPS})
ocv_include_modules(${sub})
endforeach()
endif()
endforeach()
endmacro()
# setup include paths for the list of passed modules
macro(ocv_target_include_modules target)
foreach(d ${ARGN})

View File

@ -415,31 +415,6 @@ function(status text)
endfunction()
# splits cmake libraries list of format "general;item1;debug;item2;release;item3" to two lists
macro(ocv_split_libs_list lst lstdbg lstopt)
set(${lstdbg} "")
set(${lstopt} "")
set(perv_keyword "")
foreach(word ${${lst}})
if(word STREQUAL "debug" OR word STREQUAL "optimized")
set(perv_keyword ${word})
elseif(word STREQUAL "general")
set(perv_keyword "")
elseif(perv_keyword STREQUAL "debug")
list(APPEND ${lstdbg} "${word}")
set(perv_keyword "")
elseif(perv_keyword STREQUAL "optimized")
list(APPEND ${lstopt} "${word}")
set(perv_keyword "")
else()
list(APPEND ${lstdbg} "${word}")
list(APPEND ${lstopt} "${word}")
set(perv_keyword "")
endif()
endforeach()
endmacro()
# remove all matching elements from the list
macro(ocv_list_filterout lst regex)
foreach(item ${${lst}})
@ -810,3 +785,35 @@ function(ocv_add_library target)
_ocv_append_target_includes(${target})
endfunction()
# build the list of opencv libs and dependencies for all modules
# _modules - variable to hold list of all modules
# _extra - variable to hold list of extra dependencies
# _3rdparty - variable to hold list of prebuilt 3rdparty libraries
macro(ocv_get_all_libs _modules _extra _3rdparty)
set(${_modules} "")
set(${_extra} "")
set(${_3rdparty} "")
foreach(m ${OPENCV_MODULES_PUBLIC})
get_target_property(deps ${m} INTERFACE_LINK_LIBRARIES)
list(INSERT ${_modules} 0 ${deps} ${m})
foreach (dep ${deps} ${OPENCV_LINKER_LIBS})
if (NOT DEFINED OPENCV_MODULE_${dep}_LOCATION)
if (TARGET ${dep})
list(INSERT ${_3rdparty} 0 ${dep})
else()
list(INSERT ${_extra} 0 ${dep})
endif()
endif()
endforeach()
endforeach()
# split 3rdparty libs and modules
list(REMOVE_ITEM ${_modules} ${${_3rdparty}} ${${_extra}})
# convert CMake lists to makefile literals
foreach(lst ${_modules} ${_3rdparty} ${_extra})
ocv_list_unique(${lst})
ocv_list_reverse(${lst})
endforeach()
endmacro()

View File

@ -77,6 +77,13 @@ if("@USE_IPPICV@" STREQUAL "TRUE") # value is defined by package builder (use ST
endif()
if(NOT TARGET opencv_core)
# Extract directory name from full path of the file currently being processed.
# Note that CMake 2.8.3 introduced CMAKE_CURRENT_LIST_DIR. We reimplement it
# for older versions of CMake to support these as well.
if(CMAKE_VERSION VERSION_LESS "2.8.3")
get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
endif()
include(${CMAKE_CURRENT_LIST_DIR}/OpenCVModules${OpenCV_MODULES_SUFFIX}.cmake)
endif()
@ -388,6 +395,10 @@ macro(ocv_include_modules)
include_directories(BEFORE "${OpenCV_INCLUDE_DIRS}")
endmacro()
macro(ocv_include_modules_recurse)
include_directories(BEFORE "${OpenCV_INCLUDE_DIRS}")
endmacro()
macro(ocv_target_link_libraries)
target_link_libraries(${ARGN})
endmacro()

View File

@ -21,6 +21,7 @@
</libs>
<skip_headers>
opencv2/hal/intrin*
opencv2/core/cuda*
opencv2/core/private*
opencv/cxeigen.hpp

View File

@ -1,13 +1,19 @@
#!/usr/bin/env python
"""gen_pattern.py
To run:
-c 10 -r 12 -o out.svg
-T type of pattern, circles, acircles, checkerboard
-s --square_size size of squares in pattern
-u --units mm, inches, px, m
-w page width in units
-h page height in units
Usage example:
python gen_pattern.py -o out.svg -r 11 -c 8 -T circles -s 20.0 -R 5.0 -u mm -w 216 -h 279
-o, --output - output file (default out.svg)
-r, --rows - pattern rows (default 11)
-c, --columns - pattern columns (default 8)
-T, --type - type of pattern, circles, acircles, checkerboard (default circles)
-s, --square_size - size of squares in pattern (default 20.0)
-R, --radius_rate - circles_radius = square_size/radius_rate (default 5.0)
-u, --units - mm, inches, px, m (default mm)
-w, --page_width - page width in units (default 216)
-h, --page_height - page height in units (default 279)
-H, --help - show help
"""
from svgfig import *
@ -16,18 +22,20 @@ import sys
import getopt
class PatternMaker:
def __init__(self, cols,rows,output,units,square_size,page_width,page_height):
def __init__(self, cols,rows,output,units,square_size,radius_rate,page_width,page_height):
self.cols = cols
self.rows = rows
self.output = output
self.units = units
self.square_size = square_size
self.radius_rate = radius_rate
self.width = page_width
self.height = page_height
self.g = SVG("g") # the svg group container
def makeCirclesPattern(self):
spacing = self.square_size
r = spacing / 5.0 #radius is a 5th of the spacing TODO parameterize
r = spacing / self.radius_rate
for x in range(1,self.cols+1):
for y in range(1,self.rows+1):
dot = SVG("circle", cx=x * spacing, cy=y * spacing, r=r, fill="black")
@ -35,7 +43,7 @@ class PatternMaker:
def makeACirclesPattern(self):
spacing = self.square_size
r = spacing / 5.0
r = spacing / self.radius_rate
for i in range(0,self.rows):
for j in range(0,self.cols):
dot = SVG("circle", cx= ((j*2 + i%2)*spacing) + spacing, cy=self.height - (i * spacing + spacing), r=r, fill="black")
@ -43,37 +51,23 @@ class PatternMaker:
def makeCheckerboardPattern(self):
spacing = self.square_size
r = spacing / 5.0
for x in range(1,self.cols+1):
for y in range(1,self.rows+1):
#TODO make a checkerboard pattern
dot = SVG("circle", cx=x * spacing, cy=y * spacing, r=r, fill="black")
self.g.append(dot)
if x%2 == y%2:
dot = SVG("rect", x=x * spacing, y=y * spacing, width=spacing, height=spacing, stroke_width="0", fill="black")
self.g.append(dot)
def save(self):
c = canvas(self.g,width="%d%s"%(self.width,self.units),height="%d%s"%(self.height,self.units),viewBox="0 0 %d %d"%(self.width,self.height))
c.inkview(self.output)
def makePattern(cols,rows,output,p_type,units,square_size,page_width,page_height):
width = page_width
spacing = square_size
height = page_height
r = spacing / 5.0
g = SVG("g") # the svg group container
for x in range(1,cols+1):
for y in range(1,rows+1):
if "circle" in p_type:
dot = SVG("circle", cx=x * spacing, cy=y * spacing, r=r, fill="black")
g.append(dot)
c = canvas(g,width="%d%s"%(width,units),height="%d%s"%(height,units),viewBox="0 0 %d %d"%(width,height))
c.inkview(output)
def main():
# parse command line options, TODO use argparse for better doc
try:
opts, args = getopt.getopt(sys.argv[1:], "ho:c:r:T:u:s:w:h:", ["help","output","columns","rows",
"type","units","square_size","page_width",
"page_height"])
opts, args = getopt.getopt(sys.argv[1:], "Ho:c:r:T:u:s:R:w:h:", ["help","output=","columns=","rows=",
"type=","units=","square_size=","radius_rate=",
"page_width=","page_height="])
except getopt.error, msg:
print msg
print "for help use --help"
@ -84,11 +78,12 @@ def main():
p_type = "circles"
units = "mm"
square_size = 20.0
radius_rate = 5.0
page_width = 216 #8.5 inches
page_height = 279 #11 inches
# process options
for o, a in opts:
if o in ("-h", "--help"):
if o in ("-H", "--help"):
print __doc__
sys.exit(0)
elif o in ("-r", "--rows"):
@ -103,11 +98,13 @@ def main():
units = a
elif o in ("-s", "--square_size"):
square_size = float(a)
elif o in ("-R", "--radius_rate"):
radius_rate = float(a)
elif o in ("-w", "--page_width"):
page_width = float(a)
elif o in ("-h", "--page_height"):
page_height = float(a)
pm = PatternMaker(columns,rows,output,units,square_size,page_width,page_height)
pm = PatternMaker(columns,rows,output,units,square_size,radius_rate,page_width,page_height)
#dict for easy lookup of pattern type
mp = {"circles":pm.makeCirclesPattern,"acircles":pm.makeACirclesPattern,"checkerboard":pm.makeCheckerboardPattern}
mp[p_type]()

View File

@ -44,7 +44,7 @@ from matplotlib import pyplot as plt
imgL = cv2.imread('tsukuba_l.png',0)
imgR = cv2.imread('tsukuba_r.png',0)
stereo = cv2.createStereoBM(numDisparities=16, blockSize=15)
stereo = cv2.StereoBM_create(numDisparities=16, blockSize=15)
disparity = stereo.compute(imgL,imgR)
plt.imshow(disparity,'gray')
plt.show()

View File

@ -30,7 +30,7 @@ y_{corrected} = y + [ p_1(r^2+ 2y^2)+ 2p_2xy]\f]
So we have five distortion parameters which in OpenCV are presented as one row matrix with 5
columns:
\f[Distortion_{coefficients}=(k_1 \hspace{10pt} k_2 \hspace{10pt} p_1 \hspace{10pt} p_2 \hspace{10pt} k_3)\f]
\f[distortion\_coefficients=(k_1 \hspace{10pt} k_2 \hspace{10pt} p_1 \hspace{10pt} p_2 \hspace{10pt} k_3)\f]
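As a small aside (a sketch, not part of this tutorial's sample code; the numeric values are arbitrary placeholders), such a 1x5 coefficient matrix can be created like this, assuming `<opencv2/core.hpp>` is included:
@code{.cpp}
// Hypothetical coefficient values, for illustration only
double k1 = -0.42, k2 = 0.51, p1 = 0.0, p2 = 0.0, k3 = -0.58;
cv::Mat distCoeffs = (cv::Mat_<double>(1, 5) << k1, k2, p1, p2, k3);
@endcode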
Now for the unit conversion we use the following formula:
@ -96,83 +96,30 @@ on how to do this you can find in the @ref tutorial_file_input_output_with_xml_y
Explanation
-----------
-# **Read the settings.**
@code{.cpp}
Settings s;
const string inputSettingsFile = argc > 1 ? argv[1] : "default.xml";
FileStorage fs(inputSettingsFile, FileStorage::READ); // Read the settings
if (!fs.isOpened())
{
cout << "Could not open the configuration file: \"" << inputSettingsFile << "\"" << endl;
return -1;
}
fs["Settings"] >> s;
fs.release(); // close Settings file
-# **Read the settings**
@snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp file_read
if (!s.goodInput)
{
cout << "Invalid input detected. Application stopping. " << endl;
return -1;
}
@endcode
For this I've used a simple OpenCV class input operation. After reading the file I have an
additional post-processing function that checks the validity of the input. Only if all inputs are
good will the *goodInput* variable be true.
-# **Get next input, if it fails or we have enough of them - calibrate**. After this we have a big
-# **Get next input, if it fails or we have enough of them - calibrate**
After this we have a big
loop where we do the following operations: get the next image from the image list, camera or
video file. If this fails or we have enough images, then we run the calibration process. In the case
of an image list we step out of the loop; otherwise the remaining frames will be undistorted (if the
option is set) by changing from *DETECTION* mode to the *CALIBRATED* one.
@code{.cpp}
for(int i = 0;;++i)
{
Mat view;
bool blinkOutput = false;
view = s.nextImage();
//----- If no more image, or got enough, then stop calibration and show result -------------
if( mode == CAPTURING && imagePoints.size() >= (unsigned)s.nrFrames )
{
if( runCalibrationAndSave(s, imageSize, cameraMatrix, distCoeffs, imagePoints))
mode = CALIBRATED;
else
mode = DETECTION;
}
if(view.empty()) // If no more images then run calibration, save and stop loop.
{
if( imagePoints.size() > 0 )
runCalibrationAndSave(s, imageSize, cameraMatrix, distCoeffs, imagePoints);
break;
imageSize = view.size(); // Format input image.
if( s.flipVertical ) flip( view, view, 0 );
}
@endcode
@snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp get_input
For some cameras we may need to flip the input image. Here we do this too.
-# **Find the pattern in the current input**. The formation of the equations I mentioned above aims
-# **Find the pattern in the current input**
The formation of the equations I mentioned above aims
at finding the major patterns in the input: in the case of the chessboard these are the corners of the
squares and for the circles, well, the circles themselves. The positions of these will form the
result, which will be written into the *pointBuf* vector.
@code{.cpp}
vector<Point2f> pointBuf;
bool found;
switch( s.calibrationPattern ) // Find feature points on the input format
{
case Settings::CHESSBOARD:
found = findChessboardCorners( view, s.boardSize, pointBuf,
CALIB_CB_ADAPTIVE_THRESH | CALIB_CB_FAST_CHECK | CALIB_CB_NORMALIZE_IMAGE);
break;
case Settings::CIRCLES_GRID:
found = findCirclesGrid( view, s.boardSize, pointBuf );
break;
case Settings::ASYMMETRIC_CIRCLES_GRID:
found = findCirclesGrid( view, s.boardSize, pointBuf, CALIB_CB_ASYMMETRIC_GRID );
break;
}
@endcode
@snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp find_pattern
Depending on the type of the input pattern you use either the @ref cv::findChessboardCorners or
the @ref cv::findCirclesGrid function. For both of them you pass the current image and the size
of the board and you'll get the positions of the patterns. Furthermore, they return a boolean
@ -188,109 +135,27 @@ Explanation
*imagePoints* vector to collect all of the equations into a single container. Finally, for
visualization feedback purposes we will draw the found points on the input image using the @ref
cv::drawChessboardCorners function.
@code{.cpp}
if ( found) // If done with success,
{
// improve the found corners' coordinate accuracy for chessboard
if( s.calibrationPattern == Settings::CHESSBOARD)
{
Mat viewGray;
cvtColor(view, viewGray, COLOR_BGR2GRAY);
cornerSubPix( viewGray, pointBuf, Size(11,11),
Size(-1,-1), TermCriteria( TermCriteria::EPS+TermCriteria::MAX_ITER, 30, 0.1 ));
}
@snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp pattern_found
-# **Show state and result to the user, plus command line control of the application**
if( mode == CAPTURING && // For camera only take new samples after delay time
(!s.inputCapture.isOpened() || clock() - prevTimestamp > s.delay*1e-3*CLOCKS_PER_SEC) )
{
imagePoints.push_back(pointBuf);
prevTimestamp = clock();
blinkOutput = s.inputCapture.isOpened();
}
// Draw the corners.
drawChessboardCorners( view, s.boardSize, Mat(pointBuf), found );
}
@endcode
-# **Show state and result to the user, plus command line control of the application**. This part
shows text output on the image.
@code{.cpp}
//----------------------------- Output Text ------------------------------------------------
string msg = (mode == CAPTURING) ? "100/100" :
mode == CALIBRATED ? "Calibrated" : "Press 'g' to start";
int baseLine = 0;
Size textSize = getTextSize(msg, 1, 1, 1, &baseLine);
Point textOrigin(view.cols - 2*textSize.width - 10, view.rows - 2*baseLine - 10);
if( mode == CAPTURING )
{
if(s.showUndistorsed)
msg = format( "%d/%d Undist", (int)imagePoints.size(), s.nrFrames );
else
msg = format( "%d/%d", (int)imagePoints.size(), s.nrFrames );
}
putText( view, msg, textOrigin, 1, 1, mode == CALIBRATED ? GREEN : RED);
if( blinkOutput )
bitwise_not(view, view);
@endcode
This part shows text output on the image.
@snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp output_text
If we ran the calibration and got the camera matrix with the distortion coefficients, we may want to
correct the image using the @ref cv::undistort function:
@code{.cpp}
//------------------------- Video capture output undistorted ------------------------------
if( mode == CALIBRATED && s.showUndistorsed )
{
Mat temp = view.clone();
undistort(temp, view, cameraMatrix, distCoeffs);
}
//------------------------------ Show image and check for input commands -------------------
imshow("Image View", view);
@endcode
Then we wait for an input key and if this is *u* we toggle the distortion removal, if it is *g*
we start again the detection process, and finally for the *ESC* key we quit the application:
@code{.cpp}
char key = waitKey(s.inputCapture.isOpened() ? 50 : s.delay);
if( key == ESC_KEY )
break;
@snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp output_undistorted
Then we show the image and wait for an input key; if it is *u* we toggle the distortion removal,
if it is *g* we restart the detection process, and finally for the *ESC* key we quit the application:
@snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp await_input
-# **Show the distortion removal for the images too**
if( key == 'u' && mode == CALIBRATED )
s.showUndistorsed = !s.showUndistorsed;
if( s.inputCapture.isOpened() && key == 'g' )
{
mode = CAPTURING;
imagePoints.clear();
}
@endcode
-# **Show the distortion removal for the images too**. When you work with an image list it is not
When you work with an image list it is not
possible to remove the distortion inside the loop. Therefore, you must do this after the loop.
Taking advantage of this, I'll now expand the @ref cv::undistort function, which in fact first
calls @ref cv::initUndistortRectifyMap to find the transformation matrices and then performs the
transformation using the @ref cv::remap function. Because, after a successful calibration, the map
calculation needs to be done only once, using this expanded form may speed up your
application:
@code{.cpp}
if( s.inputType == Settings::IMAGE_LIST && s.showUndistorsed )
{
Mat view, rview, map1, map2;
initUndistortRectifyMap(cameraMatrix, distCoeffs, Mat(),
getOptimalNewCameraMatrix(cameraMatrix, distCoeffs, imageSize, 1, imageSize, 0),
imageSize, CV_16SC2, map1, map2);
for(int i = 0; i < (int)s.imageList.size(); i++ )
{
view = imread(s.imageList[i], 1);
if(view.empty())
continue;
remap(view, rview, map1, map2, INTER_LINEAR);
imshow("Image View", rview);
char c = waitKey();
if( c == ESC_KEY || c == 'q' || c == 'Q' )
break;
}
}
@endcode
@snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp show_results
The calibration and save
------------------------
@ -304,24 +169,7 @@ Therefore in the first function we just split up these two processes. Because we
of the calibration variables we'll create these variables here and pass on both of them to the
calibration and saving function. Again, I'll not show the saving part as that has little in common
with the calibration. Explore the source file in order to find out how and what:
@code{.cpp}
bool runCalibrationAndSave(Settings& s, Size imageSize, Mat& cameraMatrix, Mat& distCoeffs,vector<vector<Point2f> > imagePoints )
{
vector<Mat> rvecs, tvecs;
vector<float> reprojErrs;
double totalAvgErr = 0;
bool ok = runCalibration(s,imageSize, cameraMatrix, distCoeffs, imagePoints, rvecs, tvecs,
reprojErrs, totalAvgErr);
cout << (ok ? "Calibration succeeded" : "Calibration failed")
<< ". avg re projection error = " << totalAvgErr ;
if( ok ) // save only if the calibration was done with success
saveCameraParams( s, imageSize, cameraMatrix, distCoeffs, rvecs ,tvecs, reprojErrs,
imagePoints, totalAvgErr);
return ok;
}
@endcode
@snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp run_and_save
We do the calibration with the help of the @ref cv::calibrateCamera function. It has the following
parameters:
@ -331,29 +179,7 @@ parameters:
present. Because we use a single pattern for all the input images, we can calculate this just
once and multiply it for all the other input views. We calculate the corner points with the
*calcBoardCornerPositions* function as:
@code{.cpp}
void calcBoardCornerPositions(Size boardSize, float squareSize, vector<Point3f>& corners,
Settings::Pattern patternType /*= Settings::CHESSBOARD*/)
{
corners.clear();
switch(patternType)
{
case Settings::CHESSBOARD:
case Settings::CIRCLES_GRID:
for( int i = 0; i < boardSize.height; ++i )
for( int j = 0; j < boardSize.width; ++j )
corners.push_back(Point3f(float( j*squareSize ), float( i*squareSize ), 0));
break;
case Settings::ASYMMETRIC_CIRCLES_GRID:
for( int i = 0; i < boardSize.height; i++ )
for( int j = 0; j < boardSize.width; j++ )
corners.push_back(Point3f(float((2*j + i % 2)*squareSize), float(i*squareSize), 0));
break;
}
}
@endcode
@snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp board_corners
And then multiply it as:
@code{.cpp}
vector<vector<Point3f> > objectPoints(1);
@ -365,12 +191,8 @@ parameters:
circle pattern). We have already collected this from @ref cv::findChessboardCorners or @ref
cv::findCirclesGrid function. We just need to pass it on.
- The size of the image acquired from the camera, video file or the images.
- The camera matrix. If we used the fixed aspect ratio option we need to set the \f$f_x\f$ to zero:
@code{.cpp}
cameraMatrix = Mat::eye(3, 3, CV_64F);
if( s.flag & CALIB_FIX_ASPECT_RATIO )
cameraMatrix.at<double>(0,0) = 1.0;
@endcode
- The camera matrix. If we used the fixed aspect ratio option we need to set \f$f_x\f$:
@snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp fixed_aspect
- The distortion coefficient matrix. Initialize with zero.
@code{.cpp}
distCoeffs = Mat::zeros(8, 1, CV_64F);
@ -393,33 +215,7 @@ double rms = calibrateCamera(objectPoints, imagePoints, imageSize, cameraMatrix,
calculate the absolute norm between what we got with our transformation and the corner/circle
finding algorithm. To find the average error we calculate the arithmetical mean of the errors
calculated for all the calibration images.
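Written as a formula (a restatement of the procedure above, where view \f$i\f$ has \f$n_i\f$ points, \f$p_{ij}\f$ are the detected points and \f$\hat{p}_{ij}\f$ the reprojected ones):
\f[err_{rms} = \sqrt{\frac{\sum_{i}\sum_{j=1}^{n_i} \left\| p_{ij} - \hat{p}_{ij} \right\|^2}{\sum_{i} n_i}}\f]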
@code{.cpp}
double computeReprojectionErrors( const vector<vector<Point3f> >& objectPoints,
const vector<vector<Point2f> >& imagePoints,
const vector<Mat>& rvecs, const vector<Mat>& tvecs,
const Mat& cameraMatrix , const Mat& distCoeffs,
vector<float>& perViewErrors)
{
vector<Point2f> imagePoints2;
int i, totalPoints = 0;
double totalErr = 0, err;
perViewErrors.resize(objectPoints.size());
for( i = 0; i < (int)objectPoints.size(); ++i )
{
projectPoints( Mat(objectPoints[i]), rvecs[i], tvecs[i], cameraMatrix, // project
distCoeffs, imagePoints2);
err = norm(Mat(imagePoints[i]), Mat(imagePoints2), NORM_L2); // difference
int n = (int)objectPoints[i].size();
perViewErrors[i] = (float) std::sqrt(err*err/n); // save for this view
totalErr += err*err; // sum it up
totalPoints += n;
}
return std::sqrt(totalErr/totalPoints); // calculate the arithmetical mean
}
@endcode
@snippet samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp compute_errors
Results
-------
@ -461,20 +257,20 @@ the input. Here's, how a detected pattern should look:
In both cases in the specified output XML/YAML file you'll find the camera and distortion
coefficients matrices:
@code{.xml}
<Camera_Matrix type_id="opencv-matrix">
<camera_matrix type_id="opencv-matrix">
<rows>3</rows>
<cols>3</cols>
<dt>d</dt>
<data>
6.5746697944293521e+002 0. 3.1950000000000000e+002 0.
6.5746697944293521e+002 2.3950000000000000e+002 0. 0. 1.</data></Camera_Matrix>
<Distortion_Coefficients type_id="opencv-matrix">
6.5746697944293521e+002 2.3950000000000000e+002 0. 0. 1.</data></camera_matrix>
<distortion_coefficients type_id="opencv-matrix">
<rows>5</rows>
<cols>1</cols>
<dt>d</dt>
<data>
-4.1802327176423804e-001 5.0715244063187526e-001 0. 0.
-5.7843597214487474e-001</data></Distortion_Coefficients>
-5.7843597214487474e-001</data></distortion_coefficients>
@endcode
Add these values as constants to your program, call the @ref cv::initUndistortRectifyMap and the
@ref cv::remap function to remove distortion and enjoy distortion free inputs for cheap and low

View File

@ -22,7 +22,7 @@ From our previous tutorial, we know already a bit of *Pixel operators*. An inter
\f[g(x) = (1 - \alpha)f_{0}(x) + \alpha f_{1}(x)\f]
By varying \f$\alpha\f$ from \f$0 \rightarrow 1\f$ this operator can be used to perform a temporal
*cross-disolve* between two images or videos, as seen in slide shows and film productions (cool,
*cross-dissolve* between two images or videos, as seen in slide shows and film productions (cool,
eh?)
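A minimal stand-alone sketch of this blend (not this tutorial's full sample; the file names are placeholders) uses @ref cv::addWeighted :
@code{.cpp}
#include <opencv2/opencv.hpp>

int main()
{
    double alpha = 0.5;                    // weight of the second image
    cv::Mat f0 = cv::imread("first.jpg");  // placeholder file names
    cv::Mat f1 = cv::imread("second.jpg"); // must have the same size and type as f0
    if (f0.empty() || f1.empty()) return -1;

    cv::Mat g;
    cv::addWeighted(f0, 1.0 - alpha, f1, alpha, 0.0, g); // g = (1-alpha)*f0 + alpha*f1

    cv::imshow("Linear Blend", g);
    cv::waitKey(0);
    return 0;
}
@endcode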
Code

View File

@ -145,7 +145,7 @@ Explanation
of size **(w/4.0, w/16.0)**
- The ellipse is rotated **angle** degrees
- The ellipse extends an arc between **0** and **360** degrees
- The color of the figure will be **Scalar( 255, 255, 0)** which means blue in RGB value.
- The color of the figure will be **Scalar( 255, 0, 0)** which means blue in RGB value.
- The ellipse's **thickness** is 2.
- *MyFilledCircle*
@code{.cpp}

View File

@ -111,7 +111,7 @@ Explanation
pt1.y = rng.uniform( y_1, y_2 );
@endcode
- We know that **rng** is a *Random number generator* object. In the code above we are
calling **rng.uniform(a,b)**. This generates a radombly uniformed distribution between
calling **rng.uniform(a,b)**. This generates a uniformly distributed random value between
the values **a** and **b** (inclusive in **a**, exclusive in **b**).
- From the explanation above, we deduce that the extremes *pt1* and *pt2* will be random
values, so the line positions will be quite unpredictable, giving a nice visual effect
@ -133,7 +133,7 @@ Explanation
are used as the *R*, *G* and *B* parameters for the line color. Hence, the color of the
lines will be random too!
-# The explanation above applies for the other functions generating circles, ellipses, polygones,
-# The explanation above applies for the other functions generating circles, ellipses, polygons,
etc. The parameters such as *center* and *vertices* are also generated randomly.
-# Before finishing, we also should take a look at the functions *Display_Random_Text* and
*Displaying_Big_End*, since they both have a few interesting features:

View File

@ -55,7 +55,7 @@ Arranging the terms: \f$r = x \cos \theta + y \sin \theta\f$
-# We can do the same operation above for all the points in an image. If the curves of two
different points intersect in the plane \f$\theta\f$ - \f$r\f$, that means that both points belong to a
same line. For instance, following with the example above and drawing the plot for two more
points: \f$x_{1} = 9\f$, \f$y_{1} = 4\f$ and \f$x_{2} = 12\f$, \f$y_{2} = 3\f$, we get:
points: \f$x_{1} = 4\f$, \f$y_{1} = 9\f$ and \f$x_{2} = 12\f$, \f$y_{2} = 3\f$, we get:
![](images/Hough_Lines_Tutorial_Theory_2.jpg)
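As a quick numeric check (assuming the first example point used earlier in this section is \f$x_{0} = 8\f$, \f$y_{0} = 6\f$): all three points lie on the line \f$3x + 4y = 48\f$, so their three sinusoids intersect at approximately the same pair
\f[r = x\cos\theta + y\sin\theta = 8\cdot\frac{3}{5} + 6\cdot\frac{4}{5} = 9.6, \qquad \theta = \arctan\frac{4}{3} \approx 0.927 \textrm{ rad}.\f]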

View File

@ -41,7 +41,7 @@ Windows users also execute:
$ bii cpp:configure -G "Visual Studio 12"
@endcode
Now execute ``bii cpp:build`` to build the project. **Note** that this can take a while, until it downloads and builds OpenCV. However, this is downloaded just once in your machine in your "user/.biicode" folder. If the OpenCV installation process fails, you might simply go there, delete OpenCV files inside "user/.biicode" and repeat.
Now execute ``bii cpp:build`` to build the project. @note This can take a while, until it downloads and builds OpenCV. However, this is done just once on your machine, in your "user/.biicode" folder. If the OpenCV installation process fails, you might simply go there, delete the OpenCV files inside "user/.biicode" and repeat.
@code{.bash}
$ bii cpp:build
@ -137,7 +137,7 @@ replace with:
diego/opencv(beta): 0
@endcode
**Note** that the first time you switch to 3.0-beta, it will also take a while to download and build the 3.0-beta release. From that point you can change back and forth between versions, just modifying your *biicode.conf requirements*.
@note The first time you switch to 3.0-beta, it will also take a while to download and build the 3.0-beta release. From that point on you can change back and forth between versions just by modifying your *biicode.conf requirements*.
Find the hooks and examples:
* [OpenCV 2.4.10](http://www.biicode.com/diego/opencv)

View File

@ -53,9 +53,9 @@ Now you have to create your CMakeLists.txt file. It should look like this:
cmake_minimum_required(VERSION 2.8)
project( DisplayImage )
find_package( OpenCV REQUIRED )
include_directories( \f${OpenCV_INCLUDE_DIRS} )
include_directories( ${OpenCV_INCLUDE_DIRS} )
add_executable( DisplayImage DisplayImage.cpp )
target_link_libraries( DisplayImage \f${OpenCV_LIBS} )
target_link_libraries( DisplayImage ${OpenCV_LIBS} )
@endcode
### Generate the executable

View File

@ -256,6 +256,12 @@ Command line arguments of opencv_traincascade application grouped by purposes:
Maximum number of threads to use during training. Notice that the actual number of used
threads may be lower, depending on your machine and compilation options.
- -acceptanceRatioBreakValue \<break_value\>
This argument determines how precise your model should become and when the training should stop.
A good guideline is to train no further than 10e-5, to ensure the model does not overtrain on your training data.
By default this value is set to -1 to disable this feature.
-# Cascade parameters:
- -stageType \<BOOST(default)\>

View File

@ -697,19 +697,19 @@ CV_EXPORTS_W bool findCirclesGrid( InputArray image, Size patternSize,
/** @brief Finds the camera intrinsic and extrinsic parameters from several views of a calibration pattern.
@param objectPoints In the new interface it is a vector of vectors of calibration pattern points
in the calibration pattern coordinate space. The outer vector contains as many elements as the
number of the pattern views. If the same calibration pattern is shown in each view and it is fully
visible, all the vectors will be the same. Although, it is possible to use partially occluded
patterns, or even different patterns in different views. Then, the vectors will be different. The
points are 3D, but since they are in a pattern coordinate system, then, if the rig is planar, it
may make sense to put the model to a XY coordinate plane so that Z-coordinate of each input object
point is 0.
@param objectPoints In the new interface it is a vector of vectors of calibration pattern points in
the calibration pattern coordinate space (e.g. std::vector<std::vector<cv::Vec3f>>). The outer
vector contains as many elements as the number of the pattern views. If the same calibration pattern
is shown in each view and it is fully visible, all the vectors will be the same. Although, it is
possible to use partially occluded patterns, or even different patterns in different views. Then,
the vectors will be different. The points are 3D, but since they are in a pattern coordinate system,
then, if the rig is planar, it may make sense to put the model to a XY coordinate plane so that
Z-coordinate of each input object point is 0.
In the old interface all the vectors of object points from different views are concatenated
together.
@param imagePoints In the new interface it is a vector of vectors of the projections of
calibration pattern points. imagePoints.size() and objectPoints.size() and imagePoints[i].size()
must be equal to objectPoints[i].size() for each i.
@param imagePoints In the new interface it is a vector of vectors of the projections of calibration
pattern points (e.g. std::vector<std::vector<cv::Vec2f>>). imagePoints.size() and
objectPoints.size() and imagePoints[i].size() must be equal to objectPoints[i].size() for each i.
In the old interface all the vectors of object points from different views are concatenated
together.
@param imageSize Size of the image used only to initialize the intrinsic camera matrix.
@ -719,11 +719,11 @@ and/or CV_CALIB_FIX_ASPECT_RATIO are specified, some or all of fx, fy, cx, cy mu
initialized before calling the function.
@param distCoeffs Output vector of distortion coefficients
\f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6],[s_1, s_2, s_3, s_4]])\f$ of 4, 5, 8 or 12 elements.
@param rvecs Output vector of rotation vectors (see Rodrigues ) estimated for each pattern view.
That is, each k-th rotation vector together with the corresponding k-th translation vector (see
the next output parameter description) brings the calibration pattern from the model coordinate
space (in which object points are specified) to the world coordinate space, that is, a real
position of the calibration pattern in the k-th pattern view (k=0.. *M* -1).
@param rvecs Output vector of rotation vectors (see Rodrigues ) estimated for each pattern view
(e.g. std::vector<cv::Mat>). That is, each k-th rotation vector together with the corresponding
k-th translation vector (see the next output parameter description) brings the calibration pattern
from the model coordinate space (in which object points are specified) to the world coordinate
space, that is, a real position of the calibration pattern in the k-th pattern view (k=0.. *M* -1).
@param tvecs Output vector of translation vectors estimated for each pattern view.
@param flags Different flags that may be zero or a combination of the following values:
- **CV_CALIB_USE_INTRINSIC_GUESS** cameraMatrix contains valid initial values of
@ -1200,7 +1200,7 @@ for the other points. The array is computed only in the RANSAC and LMedS methods
This function estimates the essential matrix based on the five-point algorithm solver in @cite Nister03 .
@cite SteweniusCFS is also related. The epipolar geometry is described by the following equation:
\f[[p_2; 1]^T K^T E K [p_1; 1] = 0 \\\f]\f[K =
\f[[p_2; 1]^T K^{-T} E K^{-1} [p_1; 1] = 0 \\\f]\f[K =
\begin{bmatrix}
f & 0 & x_{pp} \\
0 & f & y_{pp} \\

View File

@ -641,7 +641,7 @@ static int run8Point( const Mat& _m1, const Mat& _m2, Mat& _fmatrix )
W.at<double>(2) = 0.;
// F0 <- U*diag([W(1), W(2), 0])*V'
gemm( U, Mat::diag(W), 1., 0, 0., TF, GEMM_1_T );
gemm( U, Mat::diag(W), 1., 0, 0., TF, 0 );
gemm( TF, V, 1., 0, 0., F0, 0/*CV_GEMM_B_T*/ );
// apply the transformation that is inverse

View File

@ -63,8 +63,7 @@ cvTriangulatePoints(CvMat* projMatr1, CvMat* projMatr2, CvMat* projPoints1, CvMa
!CV_IS_MAT(points4D) )
CV_Error( CV_StsUnsupportedFormat, "Input parameters must be matrices" );
int numPoints;
numPoints = projPoints1->cols;
int numPoints = projPoints1->cols;
if( numPoints < 1 )
CV_Error( CV_StsOutOfRange, "Number of points must be more than zero" );
@ -82,57 +81,39 @@ cvTriangulatePoints(CvMat* projMatr1, CvMat* projMatr2, CvMat* projPoints1, CvMa
projMatr2->cols != 4 || projMatr2->rows != 3)
CV_Error( CV_StsUnmatchedSizes, "Size of projection matrices must be 3x4" );
CvMat matrA;
double matrA_dat[24];
matrA = cvMat(6,4,CV_64F,matrA_dat);
// preallocate SVD matrices on stack
cv::Matx<double, 6, 4> matrA;
cv::Matx<double, 6, 4> matrU;
cv::Matx<double, 4, 1> matrW;
cv::Matx<double, 4, 4> matrV;
//CvMat matrU;
CvMat matrW;
CvMat matrV;
//double matrU_dat[9*9];
double matrW_dat[6*4];
double matrV_dat[4*4];
//matrU = cvMat(6,6,CV_64F,matrU_dat);
matrW = cvMat(6,4,CV_64F,matrW_dat);
matrV = cvMat(4,4,CV_64F,matrV_dat);
CvMat* projPoints[2];
CvMat* projMatrs[2];
projPoints[0] = projPoints1;
projPoints[1] = projPoints2;
projMatrs[0] = projMatr1;
projMatrs[1] = projMatr2;
CvMat* projPoints[2] = {projPoints1, projPoints2};
CvMat* projMatrs[2] = {projMatr1, projMatr2};
/* Solve system for each point */
int i,j;
for( i = 0; i < numPoints; i++ )/* For each point */
for( int i = 0; i < numPoints; i++ )/* For each point */
{
/* Fill matrix for current point */
for( j = 0; j < 2; j++ )/* For each view */
for( int j = 0; j < 2; j++ )/* For each view */
{
double x,y;
x = cvmGet(projPoints[j],0,i);
y = cvmGet(projPoints[j],1,i);
for( int k = 0; k < 4; k++ )
{
cvmSet(&matrA, j*3+0, k, x * cvmGet(projMatrs[j],2,k) - cvmGet(projMatrs[j],0,k) );
cvmSet(&matrA, j*3+1, k, y * cvmGet(projMatrs[j],2,k) - cvmGet(projMatrs[j],1,k) );
cvmSet(&matrA, j*3+2, k, x * cvmGet(projMatrs[j],1,k) - y * cvmGet(projMatrs[j],0,k) );
matrA(j*3+0, k) = x * cvmGet(projMatrs[j],2,k) - cvmGet(projMatrs[j],0,k);
matrA(j*3+1, k) = y * cvmGet(projMatrs[j],2,k) - cvmGet(projMatrs[j],1,k);
matrA(j*3+2, k) = x * cvmGet(projMatrs[j],1,k) - y * cvmGet(projMatrs[j],0,k);
}
}
/* Solve system for current point */
{
cvSVD(&matrA,&matrW,0,&matrV,CV_SVD_V_T);
cv::SVD::compute(matrA, matrW, matrU, matrV);
/* Copy computed point */
cvmSet(points4D,0,i,cvmGet(&matrV,3,0));/* X */
cvmSet(points4D,1,i,cvmGet(&matrV,3,1));/* Y */
cvmSet(points4D,2,i,cvmGet(&matrV,3,2));/* Z */
cvmSet(points4D,3,i,cvmGet(&matrV,3,3));/* W */
}
/* Copy computed point */
cvmSet(points4D,0,i,matrV(3,0));/* X */
cvmSet(points4D,1,i,matrV(3,1));/* Y */
cvmSet(points4D,2,i,matrV(3,2));/* Z */
cvmSet(points4D,3,i,matrV(3,3));/* W */
}
#if 0

View File

@ -381,7 +381,7 @@ TEST_F(fisheyeTest, EtimateUncertainties)
EXPECT_MAT_NEAR(errors.c, cv::Vec2d(0.890439368129246, 0.816096854937896), 1e-10);
EXPECT_MAT_NEAR(errors.k, cv::Vec4d(0.00516248605191506, 0.0168181467500934, 0.0213118690274604, 0.00916010877545648), 1e-10);
EXPECT_MAT_NEAR(err_std, cv::Vec2d(0.187475975266883, 0.185678953263995), 1e-10);
CV_Assert(abs(rms - 0.263782587133546) < 1e-10);
CV_Assert(fabs(rms - 0.263782587133546) < 1e-10);
CV_Assert(errors.alpha == 0);
}

View File

@ -973,26 +973,12 @@ int CV_FundamentalMatTest::prepare_test_case( int test_case_idx )
return code;
}
void CV_FundamentalMatTest::run_func()
{
//if(!test_cpp)
{
CvMat _input0 = test_mat[INPUT][0], _input1 = test_mat[INPUT][1];
CvMat F = test_mat[TEMP][0], mask = test_mat[TEMP][1];
f_result = cvFindFundamentalMat( &_input0, &_input1, &F, method, MAX(sigma*3, 0.01), 0, &mask );
}
/*else
{
cv::findFundamentalMat(const Mat& points1, const Mat& points2,
vector<uchar>& mask, int method=FM_RANSAC,
double param1=3., double param2=0.99 );
CV_EXPORTS Mat findFundamentalMat( const Mat& points1, const Mat& points2,
int method=FM_RANSAC,
double param1=3., double param2=0.99 );
}*/
// cvFindFundamentalMat calls cv::findFundamentalMat
CvMat _input0 = test_mat[INPUT][0], _input1 = test_mat[INPUT][1];
CvMat F = test_mat[TEMP][0], mask = test_mat[TEMP][1];
f_result = cvFindFundamentalMat( &_input0, &_input1, &F, method, MAX(sigma*3, 0.01), 0, &mask );
}
@ -1022,7 +1008,7 @@ void CV_FundamentalMatTest::prepare_to_validation( int test_case_idx )
F0 *= 1./f0[8];
uchar* status = test_mat[TEMP][1].ptr();
double err_level = method <= CV_FM_8POINT ? 1 : get_success_error_level( test_case_idx, OUTPUT, 1 );
double err_level = get_success_error_level( test_case_idx, OUTPUT, 1 );
uchar* mtfm1 = test_mat[REF_OUTPUT][1].ptr();
uchar* mtfm2 = test_mat[OUTPUT][1].ptr();
double* f_prop1 = test_mat[REF_OUTPUT][0].ptr<double>();

View File

@ -138,7 +138,12 @@ protected:
{
InT d = disp(y, x);
double from[4] = { x, y, d, 1 };
double from[4] = {
static_cast<double>(x),
static_cast<double>(y),
static_cast<double>(d),
1.0,
};
Mat_<double> res = Q * Mat_<double>(4, 1, from);
res /= res(3, 0);

View File

@ -183,6 +183,9 @@ protected:
method, totalTestsCount - successfulTestsCount, totalTestsCount, maxError, mode);
ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
}
cout << "mode: " << mode << ", method: " << method << " -> "
<< ((double)successfulTestsCount / totalTestsCount) * 100 << "%"
<< " (err < " << maxError << ")" << endl;
}
}
}

View File

@ -104,7 +104,10 @@ void CV_UndistortPointsBadArgTest::run(int)
img_size.height = 600;
double cam[9] = {150.f, 0.f, img_size.width/2.f, 0, 300.f, img_size.height/2.f, 0.f, 0.f, 1.f};
double dist[4] = {0.01,0.02,0.001,0.0005};
double s_points[N_POINTS2] = {img_size.width/4,img_size.height/4};
double s_points[N_POINTS2] = {
static_cast<double>(img_size.width) / 4.0,
static_cast<double>(img_size.height) / 4.0,
};
double d_points[N_POINTS2];
double p[9] = {155.f, 0.f, img_size.width/2.f+img_size.width/50.f, 0, 310.f, img_size.height/2.f+img_size.height/50.f, 0.f, 0.f, 1.f};
double r[9] = {1,0,0,0,1,0,0,0,1};

View File

@ -10,8 +10,10 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2000-2015, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2015, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
@ -2449,9 +2451,7 @@ matrix. The Singular Value Decomposition is used to solve least-square
problems, under-determined linear systems, invert matrices, compute
condition numbers, and so on.
For a faster operation, you can pass flags=SVD::MODIFY_A|... to modify
the decomposed matrix when it is not necessary to preserve it. If you
want to compute a condition number of a matrix or an absolute value of
If you want to compute a condition number of a matrix or an absolute value of
its determinant, you do not need `u` and `vt`. You can pass
flags=SVD::NO_UV|... . Another flag SVD::FULL_UV indicates that full-size u
and vt must be computed, which is not necessary most of the time.
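A minimal sketch of the NO_UV path described above (the matrix entries are arbitrary):
@code
cv::Mat A = (cv::Mat_<double>(3, 3) << 2, 0, 0,
                                       0, 1, 0,
                                       0, 0, 0.5);
cv::Mat w;
cv::SVD::compute(A, w, cv::SVD::NO_UV);  // singular values only, u and vt are not computed
double cond = w.at<double>(0) / w.at<double>(w.rows - 1);  // condition number, here 2/0.5 = 4
@endcode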
@ -2462,8 +2462,8 @@ class CV_EXPORTS SVD
{
public:
enum Flags {
/** use the algorithm to modify the decomposed matrix; it can save space and speed up
processing */
/** allow the algorithm to modify the decomposed matrix; it can save space and speed up
processing. Currently ignored. */
MODIFY_A = 1,
/** indicates that only a vector of singular values `w` is to be processed, while u and vt
will be set to empty matrices */
@ -2921,6 +2921,10 @@ public:
Algorithm();
virtual ~Algorithm();
/** @brief Clears the algorithm state
*/
CV_WRAP virtual void clear() {}
/** @brief Stores algorithm parameters in a file storage
*/
virtual void write(FileStorage& fs) const { (void)fs; }
@ -2928,6 +2932,75 @@ public:
/** @brief Reads algorithm parameters from a file storage
*/
virtual void read(const FileNode& fn) { (void)fn; }
/** @brief Returns true if the Algorithm is empty (e.g. in the very beginning or after unsuccessful read)
*/
virtual bool empty() const { return false; }
/** @brief Reads algorithm from the file node
This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
@code
Ptr<SVM> svm = Algorithm::read<SVM>(fn);
@endcode
In order to make this method work, the derived class must overwrite Algorithm::read(const
FileNode& fn) and also have a static create() method without parameters
(or with all the optional parameters)
*/
template<typename _Tp> static Ptr<_Tp> read(const FileNode& fn)
{
Ptr<_Tp> obj = _Tp::create();
obj->read(fn);
return !obj->empty() ? obj : Ptr<_Tp>();
}
/** @brief Loads algorithm from the file
@param filename Name of the file to read.
@param objname The optional name of the node to read (if empty, the first top-level node will be used)
This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
@code
Ptr<SVM> svm = Algorithm::load<SVM>("my_svm_model.xml");
@endcode
In order to make this method work, the derived class must overwrite Algorithm::read(const
FileNode& fn).
*/
template<typename _Tp> static Ptr<_Tp> load(const String& filename, const String& objname=String())
{
FileStorage fs(filename, FileStorage::READ);
FileNode fn = objname.empty() ? fs.getFirstTopLevelNode() : fs[objname];
Ptr<_Tp> obj = _Tp::create();
obj->read(fn);
return !obj->empty() ? obj : Ptr<_Tp>();
}
/** @brief Loads algorithm from a String
@param strModel The string variable containing the model you want to load.
@param objname The optional name of the node to read (if empty, the first top-level node will be used)
This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
@code
Ptr<SVM> svm = Algorithm::loadFromString<SVM>(myStringModel);
@endcode
*/
template<typename _Tp> static Ptr<_Tp> loadFromString(const String& strModel, const String& objname=String())
{
FileStorage fs(strModel, FileStorage::READ + FileStorage::MEMORY);
FileNode fn = objname.empty() ? fs.getFirstTopLevelNode() : fs[objname];
Ptr<_Tp> obj = _Tp::create();
obj->read(fn);
return !obj->empty() ? obj : Ptr<_Tp>();
}
/** Saves the algorithm to a file.
In order to make this method work, the derived class must implement Algorithm::write(FileStorage& fs). */
CV_WRAP virtual void save(const String& filename) const;
/** Returns the algorithm string identifier.
This string is used as top level xml/yml node tag when the object is saved to a file or string. */
CV_WRAP virtual String getDefaultName() const;
};
struct Param {

View File

@ -253,7 +253,7 @@ void cv::Affine3<T>::rotation(const Vec3& _rvec)
double c = std::cos(theta);
double s = std::sin(theta);
double c1 = 1. - c;
double itheta = theta ? 1./theta : 0.;
double itheta = (theta != 0) ? 1./theta : 0.;
rx *= itheta; ry *= itheta; rz *= itheta;

View File

@ -53,6 +53,7 @@
#include "opencv2/core/cvdef.h"
#include "opencv2/core/cvstd.hpp"
#include "opencv2/hal.hpp"
namespace cv
{
@ -400,140 +401,30 @@ configurations while CV_DbgAssert is only retained in the Debug configuration.
# define CV_DbgAssert(expr)
#endif
/////////////// saturate_cast (used in image & signal processing) ///////////////////
/**
Template function for accurate conversion from one primitive type to another.
The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\<T\>()
and others. They perform an efficient and accurate conversion from one primitive type to another
(see the introduction chapter). saturate in the name means that when the input value v is out of the
range of the target type, the result is not formed just by taking low bits of the input, but instead
the value is clipped. For example:
@code
uchar a = saturate_cast<uchar>(-100); // a = 0 (UCHAR_MIN)
short b = saturate_cast<short>(33333.33333); // b = 32767 (SHRT_MAX)
@endcode
Such clipping is done when the target type is unsigned char, signed char, unsigned short or
signed short. For 32-bit integers, no clipping is done.
When the parameter is a floating-point value and the target type is an integer (8-, 16- or 32-bit),
the floating-point value is first rounded to the nearest integer and then clipped if needed (when
the target type is 8- or 16-bit).
This operation is used throughout OpenCV, from the simplest to the most complex image processing functions.
@param v Function parameter.
@sa add, subtract, multiply, divide, Mat::convertTo
*/
template<typename _Tp> static inline _Tp saturate_cast(uchar v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(schar v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(ushort v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(short v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(unsigned v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(int v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(float v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(double v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(int64 v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(uint64 v) { return _Tp(v); }
//! @cond IGNORED
template<> inline uchar saturate_cast<uchar>(schar v) { return (uchar)std::max((int)v, 0); }
template<> inline uchar saturate_cast<uchar>(ushort v) { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); }
template<> inline uchar saturate_cast<uchar>(int v) { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
template<> inline uchar saturate_cast<uchar>(short v) { return saturate_cast<uchar>((int)v); }
template<> inline uchar saturate_cast<uchar>(unsigned v) { return (uchar)std::min(v, (unsigned)UCHAR_MAX); }
template<> inline uchar saturate_cast<uchar>(float v) { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
template<> inline uchar saturate_cast<uchar>(double v) { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
template<> inline uchar saturate_cast<uchar>(int64 v) { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
template<> inline uchar saturate_cast<uchar>(uint64 v) { return (uchar)std::min(v, (uint64)UCHAR_MAX); }
template<> inline schar saturate_cast<schar>(uchar v) { return (schar)std::min((int)v, SCHAR_MAX); }
template<> inline schar saturate_cast<schar>(ushort v) { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
template<> inline schar saturate_cast<schar>(int v) { return (schar)((unsigned)(v-SCHAR_MIN) <= (unsigned)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
template<> inline schar saturate_cast<schar>(short v) { return saturate_cast<schar>((int)v); }
template<> inline schar saturate_cast<schar>(unsigned v) { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
template<> inline schar saturate_cast<schar>(float v) { int iv = cvRound(v); return saturate_cast<schar>(iv); }
template<> inline schar saturate_cast<schar>(double v) { int iv = cvRound(v); return saturate_cast<schar>(iv); }
template<> inline schar saturate_cast<schar>(int64 v) { return (schar)((uint64)((int64)v-SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
template<> inline schar saturate_cast<schar>(uint64 v) { return (schar)std::min(v, (uint64)SCHAR_MAX); }
template<> inline ushort saturate_cast<ushort>(schar v) { return (ushort)std::max((int)v, 0); }
template<> inline ushort saturate_cast<ushort>(short v) { return (ushort)std::max((int)v, 0); }
template<> inline ushort saturate_cast<ushort>(int v) { return (ushort)((unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
template<> inline ushort saturate_cast<ushort>(unsigned v) { return (ushort)std::min(v, (unsigned)USHRT_MAX); }
template<> inline ushort saturate_cast<ushort>(float v) { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
template<> inline ushort saturate_cast<ushort>(double v) { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
template<> inline ushort saturate_cast<ushort>(int64 v) { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
template<> inline ushort saturate_cast<ushort>(uint64 v) { return (ushort)std::min(v, (uint64)USHRT_MAX); }
template<> inline short saturate_cast<short>(ushort v) { return (short)std::min((int)v, SHRT_MAX); }
template<> inline short saturate_cast<short>(int v) { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
template<> inline short saturate_cast<short>(unsigned v) { return (short)std::min(v, (unsigned)SHRT_MAX); }
template<> inline short saturate_cast<short>(float v) { int iv = cvRound(v); return saturate_cast<short>(iv); }
template<> inline short saturate_cast<short>(double v) { int iv = cvRound(v); return saturate_cast<short>(iv); }
template<> inline short saturate_cast<short>(int64 v) { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
template<> inline short saturate_cast<short>(uint64 v) { return (short)std::min(v, (uint64)SHRT_MAX); }
template<> inline int saturate_cast<int>(float v) { return cvRound(v); }
template<> inline int saturate_cast<int>(double v) { return cvRound(v); }
// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
template<> inline unsigned saturate_cast<unsigned>(float v) { return cvRound(v); }
template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
//! @endcond
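A small sketch demonstrating the clipping behaviour described above (floating-point inputs are rounded with cvRound, then clipped for 8/16-bit targets):

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    std::cout << (int)cv::saturate_cast<uchar>(-100)    << std::endl;  // 0   (clipped to UCHAR_MIN)
    std::cout << (int)cv::saturate_cast<uchar>(300)     << std::endl;  // 255 (clipped to UCHAR_MAX)
    std::cout << cv::saturate_cast<short>(33333.33333)  << std::endl;  // 32767 (SHRT_MAX)
    std::cout << cv::saturate_cast<int>(-7.8)           << std::endl;  // -8  (rounded, not clipped)
    return 0;
}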
//////////////////////////////// low-level functions ////////////////////////////////
CV_EXPORTS int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
CV_EXPORTS int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
CV_EXPORTS bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
CV_EXPORTS bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n);
CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n);
CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n, int cellSize);
CV_EXPORTS float normL1_(const float* a, const float* b, int n);
CV_EXPORTS float normL2Sqr_(const float* a, const float* b, int n);
CV_EXPORTS void exp(const float* src, float* dst, int n);
CV_EXPORTS void log(const float* src, float* dst, int n);
CV_EXPORTS void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
CV_EXPORTS void magnitude(const float* x, const float* y, float* dst, int n);
/** @brief Computes the cube root of an argument.
The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
single-precision data.
@param val A function argument.
/*
* Hamming distance functor - counts the bit differences between two strings - useful for the Brief descriptor
* bit count of A exclusive XOR'ed with B
*/
CV_EXPORTS_W float cubeRoot(float val);
struct CV_EXPORTS Hamming
{
enum { normType = NORM_HAMMING };
typedef unsigned char ValueType;
typedef int ResultType;
/** @brief Calculates the angle of a 2D vector in degrees.
/** this will count the bits in a ^ b
*/
ResultType operator()( const unsigned char* a, const unsigned char* b, int size ) const;
};
The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
@param x x-coordinate of the vector.
@param y y-coordinate of the vector.
*/
CV_EXPORTS_W float fastAtan2(float y, float x);
typedef Hamming HammingLUT;
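A sketch of using the Hamming functor above directly on two short binary descriptors; per its definition it forwards to hal::normHamming:

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    unsigned char a[4] = { 0xFF, 0x0F, 0x00, 0xAA };
    unsigned char b[4] = { 0x0F, 0x0F, 0xFF, 0x55 };
    cv::Hamming dist;
    // Differing bits per byte: 4 + 0 + 8 + 8 = 20
    std::cout << dist(a, b, 4) << std::endl;  // 20
    return 0;
}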
/////////////////////////////////// inline norms ////////////////////////////////////
template<typename _Tp> inline _Tp cv_abs(_Tp x) { return std::abs(x); }
inline int cv_abs(uchar x) { return x; }
inline int cv_abs(schar x) { return std::abs(x); }
inline int cv_abs(ushort x) { return x; }
inline int cv_abs(short x) { return std::abs(x); }
template<typename _Tp, typename _AccTp> static inline
_AccTp normL2Sqr(const _Tp* a, int n)
@ -563,12 +454,12 @@ _AccTp normL1(const _Tp* a, int n)
#if CV_ENABLE_UNROLLED
for(; i <= n - 4; i += 4 )
{
s += (_AccTp)std::abs(a[i]) + (_AccTp)std::abs(a[i+1]) +
(_AccTp)std::abs(a[i+2]) + (_AccTp)std::abs(a[i+3]);
s += (_AccTp)cv_abs(a[i]) + (_AccTp)cv_abs(a[i+1]) +
(_AccTp)cv_abs(a[i+2]) + (_AccTp)cv_abs(a[i+3]);
}
#endif
for( ; i < n; i++ )
s += std::abs(a[i]);
s += cv_abs(a[i]);
return s;
}
@ -577,7 +468,7 @@ _AccTp normInf(const _Tp* a, int n)
{
_AccTp s = 0;
for( int i = 0; i < n; i++ )
s = std::max(s, (_AccTp)std::abs(a[i]));
s = std::max(s, (_AccTp)cv_abs(a[i]));
return s;
}
@ -601,11 +492,10 @@ _AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n)
return s;
}
template<> inline
float normL2Sqr(const float* a, const float* b, int n)
inline float normL2Sqr(const float* a, const float* b, int n)
{
if( n >= 8 )
return normL2Sqr_(a, b, n);
return hal::normL2Sqr_(a, b, n);
float s = 0;
for( int i = 0; i < n; i++ )
{
@ -635,11 +525,10 @@ _AccTp normL1(const _Tp* a, const _Tp* b, int n)
return s;
}
template<> inline
float normL1(const float* a, const float* b, int n)
inline float normL1(const float* a, const float* b, int n)
{
if( n >= 8 )
return normL1_(a, b, n);
return hal::normL1_(a, b, n);
float s = 0;
for( int i = 0; i < n; i++ )
{
@ -649,10 +538,9 @@ float normL1(const float* a, const float* b, int n)
return s;
}
template<> inline
int normL1(const uchar* a, const uchar* b, int n)
inline int normL1(const uchar* a, const uchar* b, int n)
{
return normL1_(a, b, n);
return hal::normL1_(a, b, n);
}
template<typename _Tp, typename _AccTp> static inline
@ -667,6 +555,23 @@ _AccTp normInf(const _Tp* a, const _Tp* b, int n)
return s;
}
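The inline norm templates above can also be called directly on raw arrays; a small sketch (the explicit template arguments select the accumulator type):

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    float a[4] = { 1.f, 2.f, 3.f, 4.f };
    float b[4] = { 0.f, 0.f, 0.f, 0.f };
    std::cout << cv::normL2Sqr<float, float>(a, b, 4) << std::endl;  // 30 (1+4+9+16)
    std::cout << cv::normL1<float, float>(a, b, 4)    << std::endl;  // 10
    std::cout << cv::normInf<float, float>(a, b, 4)   << std::endl;  // 4
    return 0;
}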
/** @brief Computes the cube root of an argument.
The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
single-precision data.
@param val A function argument.
*/
CV_EXPORTS_W float cubeRoot(float val);
/** @brief Calculates the angle of a 2D vector in degrees.
The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
@param x x-coordinate of the vector.
@param y y-coordinate of the vector.
*/
CV_EXPORTS_W float fastAtan2(float y, float x);
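A quick sketch of the two scalar helpers declared above; the printed values follow from the documented behaviour:

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    std::cout << cv::cubeRoot(-27.f)       << std::endl;  // -3 (negative arguments are handled)
    std::cout << cv::fastAtan2(1.f, 1.f)   << std::endl;  // ~45 degrees
    std::cout << cv::fastAtan2(-1.f, 0.f)  << std::endl;  // ~270 degrees (full 0..360 range)
    return 0;
}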
////////////////// forward declarations for important OpenCV types //////////////////

View File

@ -56,23 +56,7 @@
#undef abs
#undef Complex
#if defined __ICL
# define CV_ICC __ICL
#elif defined __ICC
# define CV_ICC __ICC
#elif defined __ECL
# define CV_ICC __ECL
#elif defined __ECC
# define CV_ICC __ECC
#elif defined __INTEL_COMPILER
# define CV_ICC __INTEL_COMPILER
#endif
#if defined CV_ICC && !defined CV_ENABLE_UNROLLED
# define CV_ENABLE_UNROLLED 0
#else
# define CV_ENABLE_UNROLLED 1
#endif
#include "opencv2/hal/defs.h"
#ifdef __OPENCV_BUILD
# define DISABLE_OPENCV_24_COMPATIBILITY
@ -86,16 +70,6 @@
# define CV_EXPORTS
#endif
#ifndef CV_INLINE
# if defined __cplusplus
# define CV_INLINE static inline
# elif defined _MSC_VER
# define CV_INLINE __inline
# else
# define CV_INLINE static
# endif
#endif
#ifndef CV_EXTERN_C
# ifdef __cplusplus
# define CV_EXTERN_C extern "C"
@ -104,216 +78,6 @@
# endif
#endif
/* CPU features and intrinsics support */
#define CV_CPU_NONE 0
#define CV_CPU_MMX 1
#define CV_CPU_SSE 2
#define CV_CPU_SSE2 3
#define CV_CPU_SSE3 4
#define CV_CPU_SSSE3 5
#define CV_CPU_SSE4_1 6
#define CV_CPU_SSE4_2 7
#define CV_CPU_POPCNT 8
#define CV_CPU_AVX 10
#define CV_CPU_AVX2 11
#define CV_CPU_FMA3 12
#define CV_CPU_AVX_512F 13
#define CV_CPU_AVX_512BW 14
#define CV_CPU_AVX_512CD 15
#define CV_CPU_AVX_512DQ 16
#define CV_CPU_AVX_512ER 17
#define CV_CPU_AVX_512IFMA512 18
#define CV_CPU_AVX_512PF 19
#define CV_CPU_AVX_512VBMI 20
#define CV_CPU_AVX_512VL 21
#define CV_CPU_NEON 100
// when adding to this list remember to update the enum in core/utility.cpp
#define CV_HARDWARE_MAX_FEATURE 255
// do not include SSE/AVX/NEON headers for NVCC compiler
#ifndef __CUDACC__
#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
# include <emmintrin.h>
# define CV_MMX 1
# define CV_SSE 1
# define CV_SSE2 1
# if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <pmmintrin.h>
# define CV_SSE3 1
# endif
# if defined __SSSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <tmmintrin.h>
# define CV_SSSE3 1
# endif
# if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <smmintrin.h>
# define CV_SSE4_1 1
# endif
# if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <nmmintrin.h>
# define CV_SSE4_2 1
# endif
# if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
# ifdef _MSC_VER
# include <nmmintrin.h>
# else
# include <popcntintrin.h>
# endif
# define CV_POPCNT 1
# endif
# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
# include <immintrin.h>
# define CV_AVX 1
# if defined(_XCR_XFEATURE_ENABLED_MASK)
# define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
# else
# define __xgetbv() 0
# endif
# endif
# if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0)
# include <immintrin.h>
# define CV_AVX2 1
# if defined __FMA__
# define CV_FMA3 1
# endif
# endif
#endif
#if (defined WIN32 || defined _WIN32) && defined(_M_ARM)
# include <Intrin.h>
# include "arm_neon.h"
# define CV_NEON 1
# define CPU_HAS_NEON_FEATURE (true)
#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
# include <arm_neon.h>
# define CV_NEON 1
#endif
#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__)
# define CV_VFP 1
#endif
#endif // __CUDACC__
#ifndef CV_POPCNT
#define CV_POPCNT 0
#endif
#ifndef CV_MMX
# define CV_MMX 0
#endif
#ifndef CV_SSE
# define CV_SSE 0
#endif
#ifndef CV_SSE2
# define CV_SSE2 0
#endif
#ifndef CV_SSE3
# define CV_SSE3 0
#endif
#ifndef CV_SSSE3
# define CV_SSSE3 0
#endif
#ifndef CV_SSE4_1
# define CV_SSE4_1 0
#endif
#ifndef CV_SSE4_2
# define CV_SSE4_2 0
#endif
#ifndef CV_AVX
# define CV_AVX 0
#endif
#ifndef CV_AVX2
# define CV_AVX2 0
#endif
#ifndef CV_FMA3
# define CV_FMA3 0
#endif
#ifndef CV_AVX_512F
# define CV_AVX_512F 0
#endif
#ifndef CV_AVX_512BW
# define CV_AVX_512BW 0
#endif
#ifndef CV_AVX_512CD
# define CV_AVX_512CD 0
#endif
#ifndef CV_AVX_512DQ
# define CV_AVX_512DQ 0
#endif
#ifndef CV_AVX_512ER
# define CV_AVX_512ER 0
#endif
#ifndef CV_AVX_512IFMA512
# define CV_AVX_512IFMA512 0
#endif
#ifndef CV_AVX_512PF
# define CV_AVX_512PF 0
#endif
#ifndef CV_AVX_512VBMI
# define CV_AVX_512VBMI 0
#endif
#ifndef CV_AVX_512VL
# define CV_AVX_512VL 0
#endif
#ifndef CV_NEON
# define CV_NEON 0
#endif
#ifndef CV_VFP
# define CV_VFP 0
#endif
/* primitive types */
/*
schar - signed 1 byte integer
uchar - unsigned 1 byte integer
short - signed 2 byte integer
ushort - unsigned 2 byte integer
int - signed 4 byte integer
uint - unsigned 4 byte integer
int64 - signed 8 byte integer
uint64 - unsigned 8 byte integer
*/
#if !defined _MSC_VER && !defined __BORLANDC__
# if defined __cplusplus && __cplusplus >= 201103L
# include <cstdint>
typedef std::uint32_t uint;
# else
# include <stdint.h>
typedef uint32_t uint;
# endif
#else
typedef unsigned uint;
#endif
typedef signed char schar;
#ifndef __IPL_H__
typedef unsigned char uchar;
typedef unsigned short ushort;
#endif
#if defined _MSC_VER || defined __BORLANDC__
typedef __int64 int64;
typedef unsigned __int64 uint64;
# define CV_BIG_INT(n) n##I64
# define CV_BIG_UINT(n) n##UI64
#else
typedef int64_t int64;
typedef uint64_t uint64;
# define CV_BIG_INT(n) n##LL
# define CV_BIG_UINT(n) n##ULL
#endif
/* special informative macros for wrapper generators */
#define CV_EXPORTS_W CV_EXPORTS
#define CV_EXPORTS_W_SIMPLE CV_EXPORTS
@ -326,11 +90,6 @@ typedef signed char schar;
#define CV_WRAP
#define CV_WRAP_AS(synonym)
/* fundamental constants */
#define CV_PI 3.1415926535897932384626433832795
#define CV_2PI 6.283185307179586476925286766559
#define CV_LOG2 0.69314718055994530941723212145818
/****************************************************************************************\
* Matrix type (Mat) *
\****************************************************************************************/
@ -417,19 +176,6 @@ typedef signed char schar;
#define CV_ELEM_SIZE(type) \
(CV_MAT_CN(type) << ((((sizeof(size_t)/4+1)*16384|0x3a50) >> CV_MAT_DEPTH(type)*2) & 3))
/****************************************************************************************\
* fast math *
\****************************************************************************************/
#if defined __BORLANDC__
# include <fastmath.h>
#elif defined __cplusplus
# include <cmath>
#else
# include <math.h>
#endif
#ifndef MIN
# define MIN(a,b) ((a) > (b) ? (b) : (a))
#endif
@ -438,164 +184,6 @@ typedef signed char schar;
# define MAX(a,b) ((a) < (b) ? (b) : (a))
#endif
#ifdef HAVE_TEGRA_OPTIMIZATION
# include "tegra_round.hpp"
#endif
//! @addtogroup core_utils
//! @{
#if CV_VFP
// 1. general scheme
#define ARM_ROUND(_value, _asm_string) \
int res; \
float temp; \
asm(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \
return res;
// 2. version for double
#ifdef __clang__
#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
#else
#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
#endif
// 3. version for float
#define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
#endif // CV_VFP
/** @brief Rounds floating-point number to the nearest integer
@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
result is not defined.
*/
CV_INLINE int cvRound( double value )
{
#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
__m128d t = _mm_set_sd( value );
return _mm_cvtsd_si32(t);
#elif defined _MSC_VER && defined _M_IX86
int t;
__asm
{
fld value;
fistp t;
}
return t;
#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
TEGRA_ROUND_DBL(value);
#elif defined CV_ICC || defined __GNUC__
# if CV_VFP
ARM_ROUND_DBL(value)
# else
return (int)lrint(value);
# endif
#else
double intpart, fractpart;
fractpart = modf(value, &intpart);
if ((fabs(fractpart) != 0.5) || ((((int)intpart) % 2) != 0))
return (int)(value + (value >= 0 ? 0.5 : -0.5));
else
return (int)intpart;
#endif
}
#ifdef __cplusplus
/** @overload */
CV_INLINE int cvRound(float value)
{
#if defined ANDROID && (defined CV_ICC || defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
TEGRA_ROUND_FLT(value);
#elif CV_VFP && !defined HAVE_TEGRA_OPTIMIZATION
ARM_ROUND_FLT(value)
#else
return cvRound((double)value);
#endif
}
/** @overload */
CV_INLINE int cvRound(int value)
{
return value;
}
#endif // __cplusplus
/** @brief Rounds floating-point number to the nearest integer not larger than the original.
The function computes an integer i such that:
\f[i \le \texttt{value} < i+1\f]
@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
result is not defined.
*/
CV_INLINE int cvFloor( double value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
__m128d t = _mm_set_sd( value );
int i = _mm_cvtsd_si32(t);
return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t,i)));
#elif defined __GNUC__
int i = (int)value;
return i - (i > value);
#else
int i = cvRound(value);
float diff = (float)(value - i);
return i - (diff < 0);
#endif
}
/** @brief Rounds floating-point number to the nearest integer not smaller than the original.
The function computes an integer i such that:
\f[i - 1 < \texttt{value} \le i\f]
@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
result is not defined.
*/
CV_INLINE int cvCeil( double value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
__m128d t = _mm_set_sd( value );
int i = _mm_cvtsd_si32(t);
return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t));
#elif defined __GNUC__
int i = (int)value;
return i + (i < value);
#else
int i = cvRound(value);
float diff = (float)(i - value);
return i + (diff < 0);
#endif
}
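Worked values for the three rounding helpers above (assuming the SSE2 path for cvRound, which rounds halfway cases to the nearest even integer):

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    std::cout << cvRound(2.5) << " " << cvRound(3.5) << " " << cvRound(-2.7) << std::endl;  // 2 4 -3
    std::cout << cvFloor(2.7) << " " << cvFloor(-2.3) << std::endl;                         // 2 -3
    std::cout << cvCeil(2.3)  << " " << cvCeil(-2.7)  << std::endl;                         // 3 -2
    return 0;
}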
/** @brief Determines if the argument is Not A Number.
@param value The input floating-point value
The function returns 1 if the argument is Not A Number (as defined by IEEE754 standard), 0
otherwise. */
CV_INLINE int cvIsNaN( double value )
{
union { uint64 u; double f; } ieee754;
ieee754.f = value;
return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) +
((unsigned)ieee754.u != 0) > 0x7ff00000;
}
/** @brief Determines if the argument is Infinity.
@param value The input floating-point value
The function returns 1 if the argument is a plus or minus infinity (as defined by IEEE754 standard)
and 0 otherwise. */
CV_INLINE int cvIsInf( double value )
{
union { uint64 u; double f; } ieee754;
ieee754.f = value;
return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 &&
(unsigned)ieee754.u == 0;
}
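A brief sketch of the NaN/Inf classifiers above:

#include <opencv2/core.hpp>
#include <iostream>
#include <limits>

int main()
{
    double nan = std::numeric_limits<double>::quiet_NaN();
    double inf = std::numeric_limits<double>::infinity();
    std::cout << cvIsNaN(nan) << " " << cvIsNaN(1.0)  << std::endl;                         // 1 0
    std::cout << cvIsInf(inf) << " " << cvIsInf(-inf) << " " << cvIsInf(nan) << std::endl;  // 1 1 0
    return 0;
}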
//! @} core_utils
/****************************************************************************************\
* exchange-add operation for atomic operations on reference counters *
\****************************************************************************************/

View File

@ -1,3 +1,47 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2015, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_CORE_IPPASYNC_HPP__
#define __OPENCV_CORE_IPPASYNC_HPP__

View File

@ -185,39 +185,43 @@ public:
_InputArray(const UMat& um);
_InputArray(const std::vector<UMat>& umv);
virtual Mat getMat(int idx=-1) const;
virtual UMat getUMat(int idx=-1) const;
virtual void getMatVector(std::vector<Mat>& mv) const;
virtual void getUMatVector(std::vector<UMat>& umv) const;
virtual cuda::GpuMat getGpuMat() const;
virtual ogl::Buffer getOGlBuffer() const;
void* getObj() const;
Mat getMat(int idx=-1) const;
Mat getMat_(int idx=-1) const;
UMat getUMat(int idx=-1) const;
void getMatVector(std::vector<Mat>& mv) const;
void getUMatVector(std::vector<UMat>& umv) const;
cuda::GpuMat getGpuMat() const;
ogl::Buffer getOGlBuffer() const;
virtual int kind() const;
virtual int dims(int i=-1) const;
virtual int cols(int i=-1) const;
virtual int rows(int i=-1) const;
virtual Size size(int i=-1) const;
virtual int sizend(int* sz, int i=-1) const;
virtual bool sameSize(const _InputArray& arr) const;
virtual size_t total(int i=-1) const;
virtual int type(int i=-1) const;
virtual int depth(int i=-1) const;
virtual int channels(int i=-1) const;
virtual bool isContinuous(int i=-1) const;
virtual bool isSubmatrix(int i=-1) const;
virtual bool empty() const;
virtual void copyTo(const _OutputArray& arr) const;
virtual void copyTo(const _OutputArray& arr, const _InputArray & mask) const;
virtual size_t offset(int i=-1) const;
virtual size_t step(int i=-1) const;
int getFlags() const;
void* getObj() const;
Size getSz() const;
int kind() const;
int dims(int i=-1) const;
int cols(int i=-1) const;
int rows(int i=-1) const;
Size size(int i=-1) const;
int sizend(int* sz, int i=-1) const;
bool sameSize(const _InputArray& arr) const;
size_t total(int i=-1) const;
int type(int i=-1) const;
int depth(int i=-1) const;
int channels(int i=-1) const;
bool isContinuous(int i=-1) const;
bool isSubmatrix(int i=-1) const;
bool empty() const;
void copyTo(const _OutputArray& arr) const;
void copyTo(const _OutputArray& arr, const _InputArray & mask) const;
size_t offset(int i=-1) const;
size_t step(int i=-1) const;
bool isMat() const;
bool isUMat() const;
bool isMatVector() const;
bool isUMatVector() const;
bool isMatx() const;
virtual ~_InputArray();
~_InputArray();
protected:
int flags;
@ -303,21 +307,21 @@ public:
_OutputArray(const UMat& m);
_OutputArray(const std::vector<UMat>& vec);
virtual bool fixedSize() const;
virtual bool fixedType() const;
virtual bool needed() const;
virtual Mat& getMatRef(int i=-1) const;
virtual UMat& getUMatRef(int i=-1) const;
virtual cuda::GpuMat& getGpuMatRef() const;
virtual ogl::Buffer& getOGlBufferRef() const;
virtual cuda::HostMem& getHostMemRef() const;
virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
virtual void createSameSize(const _InputArray& arr, int mtype) const;
virtual void release() const;
virtual void clear() const;
virtual void setTo(const _InputArray& value, const _InputArray & mask = _InputArray()) const;
bool fixedSize() const;
bool fixedType() const;
bool needed() const;
Mat& getMatRef(int i=-1) const;
UMat& getUMatRef(int i=-1) const;
cuda::GpuMat& getGpuMatRef() const;
ogl::Buffer& getOGlBufferRef() const;
cuda::HostMem& getHostMemRef() const;
void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
void createSameSize(const _InputArray& arr, int mtype) const;
void release() const;
void clear() const;
void setTo(const _InputArray& value, const _InputArray & mask = _InputArray()) const;
void assign(const UMat& u) const;
void assign(const Mat& m) const;
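For context, a minimal sketch of the proxy pattern these (now non-virtual) classes implement; scaleByTwo is a hypothetical helper, shown only to illustrate that Mat and std::vector flow through the same InputArray/OutputArray signature:

#include <opencv2/core.hpp>
#include <vector>

// Hypothetical helper, for illustration only: scales whatever container is passed in.
static void scaleByTwo(cv::InputArray src, cv::OutputArray dst)
{
    cv::Mat m = src.getMat();        // inline fast path when src wraps a Mat
    m.convertTo(dst, m.type(), 2.0); // dst may be a Mat, UMat or std::vector
}

int main()
{
    cv::Mat a = cv::Mat::ones(3, 3, CV_32F), b;
    std::vector<float> v(5, 1.f), w;
    scaleByTwo(a, b);  // Mat in, Mat out
    scaleByTwo(v, w);  // std::vector works through the same proxies
    return 0;
}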

View File

@ -7,11 +7,13 @@
// copy or use the software.
//
//
// License Agreement
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
@ -61,6 +63,8 @@ inline void _InputArray::init(int _flags, const void* _obj, Size _sz)
{ flags = _flags; obj = (void*)_obj; sz = _sz; }
inline void* _InputArray::getObj() const { return obj; }
inline int _InputArray::getFlags() const { return flags; }
inline Size _InputArray::getSz() const { return sz; }
inline _InputArray::_InputArray() { init(NONE, 0); }
inline _InputArray::_InputArray(int _flags, void* _obj) { init(_flags, _obj); }
@ -110,6 +114,13 @@ inline _InputArray::_InputArray(const cuda::HostMem& cuda_mem)
inline _InputArray::~_InputArray() {}
inline Mat _InputArray::getMat(int i) const
{
if( kind() == MAT && i < 0 )
return *(const Mat*)obj;
return getMat_(i);
}
inline bool _InputArray::isMat() const { return kind() == _InputArray::MAT; }
inline bool _InputArray::isUMat() const { return kind() == _InputArray::UMAT; }
inline bool _InputArray::isMatVector() const { return kind() == _InputArray::STD_VECTOR_MAT; }

View File

@ -427,7 +427,7 @@ template<typename _Tp, int m> struct Matx_DetOp
double operator ()(const Matx<_Tp, m, m>& a) const
{
Matx<_Tp, m, m> temp = a;
double p = LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0);
double p = hal::LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0);
if( p == 0 )
return p;
for( int i = 0; i < m; i++ )

View File

@ -12,6 +12,8 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
@ -70,9 +72,9 @@ template<typename _Tp, int m> struct Matx_FastInvOp
b(i, i) = (_Tp)1;
if( method == DECOMP_CHOLESKY )
return Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
return hal::Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
return LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
return hal::LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
}
};

View File

@ -136,14 +136,6 @@ namespace cv
/* the alignment of all the allocated buffers */
#define CV_MALLOC_ALIGN 16
#ifdef __GNUC__
# define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
#elif defined _MSC_VER
# define CV_DECL_ALIGNED(x) __declspec(align(x))
#else
# define CV_DECL_ALIGNED(x)
#endif
/* IEEE754 constants and macros */
#define CV_TOGGLE_FLT(x) ((x)^((int)(x) < 0 ? 0x7fffffff : 0))
#define CV_TOGGLE_DBL(x) ((x)^((int64)(x) < 0 ? CV_BIG_INT(0x7fffffffffffffff) : 0))

View File

@ -113,22 +113,6 @@ bytes of the header. In C++ interface the role of CvArr is played by InputArray
*/
typedef void CvArr;
typedef union Cv32suf
{
int i;
unsigned u;
float f;
}
Cv32suf;
typedef union Cv64suf
{
int64 i;
uint64 u;
double f;
}
Cv64suf;
typedef int CVStatus;
/** @see cv::Error::Code */

View File

@ -10,8 +10,10 @@
// Intel License Agreement
// For Open Source Computer Vision Library
//
// Copyright( C) 2000, Intel Corporation, all rights reserved.
// Copyright( C) 2000-2015, Intel Corporation, all rights reserved.
// Copyright (C) 2011-2013, NVIDIA Corporation, all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,

View File

@ -53,6 +53,20 @@ Algorithm::~Algorithm()
{
}
void Algorithm::save(const String& filename) const
{
FileStorage fs(filename, FileStorage::WRITE);
fs << getDefaultName() << "{";
fs << "format" << (int)3;
write(fs);
fs << "}";
}
String Algorithm::getDefaultName() const
{
return String("my_object");
}
}
/* End of file. */
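Given the default getDefaultName() above and a hypothetical subclass whose write() stores a single ksize field, save("f.yml") would produce a file of roughly this shape (a sketch, not verbatim output):

%YAML:1.0
my_object:
   format: 3
   ksize: 7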

View File

@ -651,7 +651,7 @@ icvGrowSeq( CvSeq *seq, int in_front_of )
/* If there is a free space just after last allocated block
and it is big enough then enlarge the last block.
This can happen only if the new block is added to the end of sequence: */
if( (unsigned)(ICV_FREE_PTR(storage) - seq->block_max) < CV_STRUCT_ALIGN &&
if( (size_t)(ICV_FREE_PTR(storage) - seq->block_max) < CV_STRUCT_ALIGN &&
storage->free_space >= seq->elem_size && !in_front_of )
{
int delta = storage->free_space / elem_size;

View File

@ -79,7 +79,7 @@ public:
for ( int i = begin; i<end; i++ )
{
tdist2[i] = std::min(normL2Sqr_(data + step*i, data + stepci, dims), dist[i]);
tdist2[i] = std::min(normL2Sqr(data + step*i, data + stepci, dims), dist[i]);
}
}
@ -114,7 +114,7 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers,
for( i = 0; i < N; i++ )
{
dist[i] = normL2Sqr_(data + step*i, data + step*centers[0], dims);
dist[i] = normL2Sqr(data + step*i, data + step*centers[0], dims);
sum0 += dist[i];
}
@ -189,7 +189,7 @@ public:
for( int k = 0; k < K; k++ )
{
const float* center = centers.ptr<float>(k);
const double dist = normL2Sqr_(sample, center, dims);
const double dist = normL2Sqr(sample, center, dims);
if( min_dist > dist )
{
@ -384,7 +384,7 @@ double cv::kmeans( InputArray _data, int K,
if( labels[i] != max_k )
continue;
sample = data.ptr<float>(i);
double dist = normL2Sqr_(sample, _old_center, dims);
double dist = normL2Sqr(sample, _old_center, dims);
if( max_dist <= dist )
{

View File

@ -50,168 +50,6 @@
namespace cv
{
/****************************************************************************************\
* LU & Cholesky implementation for small matrices *
\****************************************************************************************/
template<typename _Tp> static inline int
LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
{
int i, j, k, p = 1;
astep /= sizeof(A[0]);
bstep /= sizeof(b[0]);
for( i = 0; i < m; i++ )
{
k = i;
for( j = i+1; j < m; j++ )
if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
k = j;
if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() )
return 0;
if( k != i )
{
for( j = i; j < m; j++ )
std::swap(A[i*astep + j], A[k*astep + j]);
if( b )
for( j = 0; j < n; j++ )
std::swap(b[i*bstep + j], b[k*bstep + j]);
p = -p;
}
_Tp d = -1/A[i*astep + i];
for( j = i+1; j < m; j++ )
{
_Tp alpha = A[j*astep + i]*d;
for( k = i+1; k < m; k++ )
A[j*astep + k] += alpha*A[i*astep + k];
if( b )
for( k = 0; k < n; k++ )
b[j*bstep + k] += alpha*b[i*bstep + k];
}
A[i*astep + i] = -d;
}
if( b )
{
for( i = m-1; i >= 0; i-- )
for( j = 0; j < n; j++ )
{
_Tp s = b[i*bstep + j];
for( k = i+1; k < m; k++ )
s -= A[i*astep + k]*b[k*bstep + j];
b[i*bstep + j] = s*A[i*astep + i];
}
}
return p;
}
int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return LUImpl(A, astep, m, b, bstep, n);
}
int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return LUImpl(A, astep, m, b, bstep, n);
}
template<typename _Tp> static inline bool
CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
{
_Tp* L = A;
int i, j, k;
double s;
astep /= sizeof(A[0]);
bstep /= sizeof(b[0]);
for( i = 0; i < m; i++ )
{
for( j = 0; j < i; j++ )
{
s = A[i*astep + j];
for( k = 0; k < j; k++ )
s -= L[i*astep + k]*L[j*astep + k];
L[i*astep + j] = (_Tp)(s*L[j*astep + j]);
}
s = A[i*astep + i];
for( k = 0; k < j; k++ )
{
double t = L[i*astep + k];
s -= t*t;
}
if( s < std::numeric_limits<_Tp>::epsilon() )
return false;
L[i*astep + i] = (_Tp)(1./std::sqrt(s));
}
if( !b )
return true;
// LLt x = b
// 1: L y = b
// 2. Lt x = y
/*
[ L00 ] y0 b0
[ L10 L11 ] y1 = b1
[ L20 L21 L22 ] y2 b2
[ L30 L31 L32 L33 ] y3 b3
[ L00 L10 L20 L30 ] x0 y0
[ L11 L21 L31 ] x1 = y1
[ L22 L32 ] x2 y2
[ L33 ] x3 y3
*/
for( i = 0; i < m; i++ )
{
for( j = 0; j < n; j++ )
{
s = b[i*bstep + j];
for( k = 0; k < i; k++ )
s -= L[i*astep + k]*b[k*bstep + j];
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
}
}
for( i = m-1; i >= 0; i-- )
{
for( j = 0; j < n; j++ )
{
s = b[i*bstep + j];
for( k = m-1; k > i; k-- )
s -= L[k*astep + i]*b[k*bstep + j];
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
}
}
return true;
}
bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return CholImpl(A, astep, m, b, bstep, n);
}
bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return CholImpl(A, astep, m, b, bstep, n);
}
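A small sketch of calling the in-place solver directly, using the cv::hal::LU entry point that the calls below are switched to; both A and b are modified in place:

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    // Solve the 2x2 system [2 1; 1 3] * x = [3; 5]; expected x = {0.8, 1.4}
    double A[4] = { 2, 1,
                    1, 3 };
    double b[2] = { 3, 5 };
    int p = cv::hal::LU(A, 2 * sizeof(double), 2, b, sizeof(double), 1);
    std::cout << p << ": " << b[0] << " " << b[1] << std::endl;  // p != 0, b = {0.8, 1.4}
    return 0;
}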
template<typename _Tp> static inline _Tp hypot(_Tp a, _Tp b)
{
a = std::abs(a);
@ -882,7 +720,7 @@ double cv::determinant( InputArray _mat )
Mat a(rows, rows, CV_32F, (uchar*)buffer);
mat.copyTo(a);
result = LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
result = hal::LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
if( result )
{
for( int i = 0; i < rows; i++ )
@ -906,7 +744,7 @@ double cv::determinant( InputArray _mat )
Mat a(rows, rows, CV_64F, (uchar*)buffer);
mat.copyTo(a);
result = LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
result = hal::LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
if( result )
{
for( int i = 0; i < rows; i++ )
@ -1169,13 +1007,13 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
setIdentity(dst);
if( method == DECOMP_LU && type == CV_32F )
result = LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
result = hal::LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
else if( method == DECOMP_LU && type == CV_64F )
result = LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
result = hal::LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
else if( method == DECOMP_CHOLESKY && type == CV_32F )
result = Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
result = hal::Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
else
result = Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
result = hal::Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
if( !result )
dst = Scalar(0);
@ -1407,16 +1245,16 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
if( method == DECOMP_LU )
{
if( type == CV_32F )
result = LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
result = hal::LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
else
result = LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
result = hal::LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
}
else if( method == DECOMP_CHOLESKY )
{
if( type == CV_32F )
result = Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
result = hal::Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
else
result = Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
result = hal::Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
}
else
{

File diff suppressed because it is too large

View File

@ -1113,7 +1113,7 @@ void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to)
Input/Output Array
\*************************************************************************************************/
Mat _InputArray::getMat(int i) const
Mat _InputArray::getMat_(int i) const
{
int k = kind();
int accessFlags = flags & ACCESS_MASK;

View File

@ -43,9 +43,9 @@
#include "precomp.hpp"
namespace
namespace cv
{
class FormattedImpl : public cv::Formatted
class FormattedImpl : public Formatted
{
enum { STATE_PROLOGUE, STATE_EPILOGUE, STATE_INTERLUDE,
STATE_ROW_OPEN, STATE_ROW_CLOSE, STATE_CN_OPEN, STATE_CN_CLOSE, STATE_VALUE, STATE_FINISHED,
@ -55,7 +55,7 @@ namespace
char floatFormat[8];
char buf[32]; // enough for double with precision up to 20
cv::Mat mtx;
Mat mtx;
int mcn; // == mtx.channels()
bool singleLine;
bool alignOrder; // true when cn first order
@ -65,8 +65,8 @@ namespace
int col;
int cn;
cv::String prologue;
cv::String epilogue;
String prologue;
String epilogue;
char braces[5];
void (FormattedImpl::*valueToStr)();
@ -81,7 +81,7 @@ namespace
public:
FormattedImpl(cv::String pl, cv::String el, cv::Mat m, char br[5], bool sLine, bool aOrder, int precision)
FormattedImpl(String pl, String el, Mat m, char br[5], bool sLine, bool aOrder, int precision)
{
CV_Assert(m.dims <= 2);
@ -253,7 +253,7 @@ namespace
}
};
class FormatterBase : public cv::Formatter
class FormatterBase : public Formatter
{
public:
FormatterBase() : prec32f(8), prec64f(16), multiline(true) {}
@ -278,14 +278,15 @@ namespace
int prec64f;
int multiline;
};
class DefaultFormatter : public FormatterBase
{
public:
cv::Ptr<cv::Formatted> format(const cv::Mat& mtx) const
Ptr<Formatted> format(const Mat& mtx) const
{
char braces[5] = {'\0', '\0', ';', '\0', '\0'};
return cv::makePtr<FormattedImpl>("[", "]", mtx, &*braces,
return makePtr<FormattedImpl>("[", "]", mtx, &*braces,
mtx.rows == 1 || !multiline, false, mtx.depth() == CV_64F ? prec64f : prec32f );
}
};
@ -294,10 +295,10 @@ namespace
{
public:
cv::Ptr<cv::Formatted> format(const cv::Mat& mtx) const
Ptr<Formatted> format(const Mat& mtx) const
{
char braces[5] = {'\0', '\0', ';', '\0', '\0'};
return cv::makePtr<FormattedImpl>("", "", mtx, &*braces,
return makePtr<FormattedImpl>("", "", mtx, &*braces,
mtx.rows == 1 || !multiline, true, mtx.depth() == CV_64F ? prec64f : prec32f );
}
};
@ -306,12 +307,12 @@ namespace
{
public:
cv::Ptr<cv::Formatted> format(const cv::Mat& mtx) const
Ptr<Formatted> format(const Mat& mtx) const
{
char braces[5] = {'[', ']', '\0', '[', ']'};
char braces[5] = {'[', ']', ',', '[', ']'};
if (mtx.cols == 1)
braces[0] = braces[1] = '\0';
return cv::makePtr<FormattedImpl>("[", "]", mtx, &*braces,
return makePtr<FormattedImpl>("[", "]", mtx, &*braces,
mtx.rows == 1 || !multiline, false, mtx.depth() == CV_64F ? prec64f : prec32f );
}
};
@ -320,17 +321,17 @@ namespace
{
public:
cv::Ptr<cv::Formatted> format(const cv::Mat& mtx) const
Ptr<Formatted> format(const Mat& mtx) const
{
static const char* numpyTypes[] =
{
"uint8", "int8", "uint16", "int16", "int32", "float32", "float64", "uint64"
};
char braces[5] = {'[', ']', '\0', '[', ']'};
char braces[5] = {'[', ']', ',', '[', ']'};
if (mtx.cols == 1)
braces[0] = braces[1] = '\0';
return cv::makePtr<FormattedImpl>("array([",
cv::format("], type='%s')", numpyTypes[mtx.depth()]), mtx, &*braces,
return makePtr<FormattedImpl>("array([",
cv::format("], dtype='%s')", numpyTypes[mtx.depth()]), mtx, &*braces,
mtx.rows == 1 || !multiline, false, mtx.depth() == CV_64F ? prec64f : prec32f );
}
};
@ -339,11 +340,11 @@ namespace
{
public:
cv::Ptr<cv::Formatted> format(const cv::Mat& mtx) const
Ptr<Formatted> format(const Mat& mtx) const
{
char braces[5] = {'\0', '\0', '\0', '\0', '\0'};
return cv::makePtr<FormattedImpl>(cv::String(),
mtx.rows > 1 ? cv::String("\n") : cv::String(), mtx, &*braces,
return makePtr<FormattedImpl>(String(),
mtx.rows > 1 ? String("\n") : String(), mtx, &*braces,
mtx.rows == 1 || !multiline, false, mtx.depth() == CV_64F ? prec64f : prec32f );
}
};
@ -352,19 +353,14 @@ namespace
{
public:
cv::Ptr<cv::Formatted> format(const cv::Mat& mtx) const
Ptr<Formatted> format(const Mat& mtx) const
{
char braces[5] = {'\0', '\0', ',', '\0', '\0'};
return cv::makePtr<FormattedImpl>("{", "}", mtx, &*braces,
return makePtr<FormattedImpl>("{", "}", mtx, &*braces,
mtx.rows == 1 || !multiline, false, mtx.depth() == CV_64F ? prec64f : prec32f );
}
};
} // namespace
namespace cv
{
Formatted::~Formatted() {}
Formatter::~Formatter() {}
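A brief usage sketch of the formatters above, assuming the Formatter::FMT_* constants and the stream operator for Ptr<Formatted> exposed by core:

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    cv::Mat m = (cv::Mat_<float>(2, 2) << 1, 2, 3, 4);
    std::cout << cv::format(m, cv::Formatter::FMT_PYTHON) << std::endl;
    std::cout << cv::format(m, cv::Formatter::FMT_NUMPY)  << std::endl;
    std::cout << cv::format(m, cv::Formatter::FMT_CSV)    << std::endl;
    return 0;
}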

View File

@ -55,6 +55,8 @@
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/core/ocl.hpp"
#include "opencv2/hal.hpp"
#include <assert.h>
#include <ctype.h>
#include <float.h>

View File

@ -2416,274 +2416,6 @@ void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
namespace cv
{
float normL2Sqr_(const float* a, const float* b, int n)
{
int j = 0; float d = 0.f;
#if CV_SSE
if( USE_SSE2 )
{
float CV_DECL_ALIGNED(16) buf[4];
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
for( ; j <= n - 8; j += 8 )
{
__m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
__m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
}
_mm_store_ps(buf, _mm_add_ps(d0, d1));
d = buf[0] + buf[1] + buf[2] + buf[3];
}
else
#endif
{
for( ; j <= n - 4; j += 4 )
{
float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
}
}
for( ; j < n; j++ )
{
float t = a[j] - b[j];
d += t*t;
}
return d;
}
float normL1_(const float* a, const float* b, int n)
{
int j = 0; float d = 0.f;
#if CV_SSE
if( USE_SSE2 )
{
float CV_DECL_ALIGNED(16) buf[4];
static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
__m128 absmask = _mm_load_ps((const float*)absbuf);
for( ; j <= n - 8; j += 8 )
{
__m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
__m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
}
_mm_store_ps(buf, _mm_add_ps(d0, d1));
d = buf[0] + buf[1] + buf[2] + buf[3];
}
else
#elif CV_NEON
float32x4_t v_sum = vdupq_n_f32(0.0f);
for ( ; j <= n - 4; j += 4)
v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
float CV_DECL_ALIGNED(16) buf[4];
vst1q_f32(buf, v_sum);
d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
{
for( ; j <= n - 4; j += 4 )
{
d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
}
}
for( ; j < n; j++ )
d += std::abs(a[j] - b[j]);
return d;
}
int normL1_(const uchar* a, const uchar* b, int n)
{
int j = 0, d = 0;
#if CV_SSE
if( USE_SSE2 )
{
__m128i d0 = _mm_setzero_si128();
for( ; j <= n - 16; j += 16 )
{
__m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
__m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
}
for( ; j <= n - 4; j += 4 )
{
__m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
__m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
}
d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
}
else
#elif CV_NEON
uint32x4_t v_sum = vdupq_n_u32(0.0f);
for ( ; j <= n - 16; j += 16)
{
uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
}
uint CV_DECL_ALIGNED(16) buf[4];
vst1q_u32(buf, v_sum);
d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
{
for( ; j <= n - 4; j += 4 )
{
d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
}
}
for( ; j < n; j++ )
d += std::abs(a[j] - b[j]);
return d;
}
static const uchar popCountTable[] =
{
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
static const uchar popCountTable2[] =
{
0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
};
static const uchar popCountTable4[] =
{
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
static int normHamming(const uchar* a, int n)
{
int i = 0, result = 0;
#if CV_NEON
{
uint32x4_t bits = vmovq_n_u32(0);
for (; i <= n - 16; i += 16) {
uint8x16_t A_vec = vld1q_u8 (a + i);
uint8x16_t bitsSet = vcntq_u8 (A_vec);
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
bits = vaddq_u32(bits, bitSet4);
}
uint64x2_t bitSet2 = vpaddlq_u32 (bits);
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
}
#endif
for( ; i <= n - 4; i += 4 )
result += popCountTable[a[i]] + popCountTable[a[i+1]] +
popCountTable[a[i+2]] + popCountTable[a[i+3]];
for( ; i < n; i++ )
result += popCountTable[a[i]];
return result;
}
int normHamming(const uchar* a, const uchar* b, int n)
{
int i = 0, result = 0;
#if CV_NEON
{
uint32x4_t bits = vmovq_n_u32(0);
for (; i <= n - 16; i += 16) {
uint8x16_t A_vec = vld1q_u8 (a + i);
uint8x16_t B_vec = vld1q_u8 (b + i);
uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
uint8x16_t bitsSet = vcntq_u8 (AxorB);
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
bits = vaddq_u32(bits, bitSet4);
}
uint64x2_t bitSet2 = vpaddlq_u32 (bits);
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
}
#endif
for( ; i <= n - 4; i += 4 )
result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
for( ; i < n; i++ )
result += popCountTable[a[i] ^ b[i]];
return result;
}
static int normHamming(const uchar* a, int n, int cellSize)
{
if( cellSize == 1 )
return normHamming(a, n);
const uchar* tab = 0;
if( cellSize == 2 )
tab = popCountTable2;
else if( cellSize == 4 )
tab = popCountTable4;
else
CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );
int i = 0, result = 0;
#if CV_ENABLE_UNROLLED
for( ; i <= n - 4; i += 4 )
result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
#endif
for( ; i < n; i++ )
result += tab[a[i]];
return result;
}
int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
{
if( cellSize == 1 )
return normHamming(a, b, n);
const uchar* tab = 0;
if( cellSize == 2 )
tab = popCountTable2;
else if( cellSize == 4 )
tab = popCountTable4;
else
CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );
int i = 0, result = 0;
#if CV_ENABLE_UNROLLED
for( ; i <= n - 4; i += 4 )
result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
#endif
for( ; i < n; i++ )
result += tab[a[i] ^ b[i]];
return result;
}
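The tables above implement byte-wise popcount; a tiny sketch of the result via the public hal::normHamming entry point these functions became:

#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    unsigned char a[2] = { 0xF0, 0x0F };
    unsigned char b[2] = { 0x0F, 0x0F };
    // popcount(0xF0 ^ 0x0F) = 8, popcount(0x0F ^ 0x0F) = 0  ->  8
    std::cout << cv::hal::normHamming(a, b, 2) << std::endl;  // 8
    return 0;
}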
template<typename T, typename ST> int
normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
{
@ -2698,7 +2430,7 @@ normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
if( mask[i] )
{
for( int k = 0; k < cn; k++ )
result = std::max(result, ST(std::abs(src[k])));
result = std::max(result, ST(cv_abs(src[k])));
}
}
*_result = result;
@ -2719,7 +2451,7 @@ normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
if( mask[i] )
{
for( int k = 0; k < cn; k++ )
result += std::abs(src[k]);
result += cv_abs(src[k]);
}
}
*_result = result;
@ -2816,6 +2548,10 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le
return 0;
}
Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const
{
return cv::hal::normHamming(a, b, size);
}
#define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
static int norm##L##_##suffix(const type* src, const uchar* mask, ntype* r, int len, int cn) \
@ -3164,10 +2900,14 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
const uchar* data = src.ptr<uchar>();
if( normType == NORM_HAMMING )
return normHamming(data, (int)len);
{
return hal::normHamming(data, (int)len);
}
if( normType == NORM_HAMMING2 )
return normHamming(data, (int)len, 2);
{
return hal::normHamming(data, (int)len, 2);
}
}
}
}
@ -3191,7 +2931,9 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
int result = 0;
for( size_t i = 0; i < it.nplanes; i++, ++it )
result += normHamming(ptrs[0], total, cellSize);
{
result += hal::normHamming(ptrs[0], total, cellSize);
}
return result;
}
@ -3673,7 +3415,9 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
int result = 0;
for( size_t i = 0; i < it.nplanes; i++, ++it )
result += normHamming(ptrs[0], ptrs[1], total, cellSize);
{
result += hal::normHamming(ptrs[0], ptrs[1], total, cellSize);
}
return result;
}
@ -3810,13 +3554,18 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
if( !mask )
{
for( int i = 0; i < nvecs; i++ )
dist[i] = normHamming(src1, src2 + step2*i, len);
dist[i] = hal::normHamming(src1, src2 + step2*i, len);
}
else
{
int val0 = INT_MAX;
for( int i = 0; i < nvecs; i++ )
dist[i] = mask[i] ? normHamming(src1, src2 + step2*i, len) : val0;
{
if (mask[i])
dist[i] = hal::normHamming(src1, src2 + step2*i, len);
else
dist[i] = val0;
}
}
}
@ -3827,13 +3576,18 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2
if( !mask )
{
for( int i = 0; i < nvecs; i++ )
dist[i] = normHamming(src1, src2 + step2*i, len, 2);
dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
}
else
{
int val0 = INT_MAX;
for( int i = 0; i < nvecs; i++ )
dist[i] = mask[i] ? normHamming(src1, src2 + step2*i, len, 2) : val0;
{
if (mask[i])
dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
else
dist[i] = val0;
}
}
}

View File

@ -144,7 +144,11 @@ protected:
depth = cvtest::randInt(rng) % (CV_64F+1);
cn = cvtest::randInt(rng) % 4 + 1;
int sz[] = {cvtest::randInt(rng)%10+1, cvtest::randInt(rng)%10+1, cvtest::randInt(rng)%10+1};
int sz[] = {
static_cast<int>(cvtest::randInt(rng)%10+1),
static_cast<int>(cvtest::randInt(rng)%10+1),
static_cast<int>(cvtest::randInt(rng)%10+1),
};
MatND test_mat_nd(3, sz, CV_MAKETYPE(depth, cn));
rng0.fill(test_mat_nd, CV_RAND_UNI, Scalar::all(ranges[depth][0]), Scalar::all(ranges[depth][1]));
@ -156,8 +160,12 @@ protected:
multiply(test_mat_nd, test_mat_scale, test_mat_nd);
}
int ssz[] = {cvtest::randInt(rng)%10+1, cvtest::randInt(rng)%10+1,
cvtest::randInt(rng)%10+1,cvtest::randInt(rng)%10+1};
int ssz[] = {
static_cast<int>(cvtest::randInt(rng)%10+1),
static_cast<int>(cvtest::randInt(rng)%10+1),
static_cast<int>(cvtest::randInt(rng)%10+1),
static_cast<int>(cvtest::randInt(rng)%10+1),
};
SparseMat test_sparse_mat = cvTsGetRandomSparseMat(4, ssz, cvtest::randInt(rng)%(CV_64F+1),
cvtest::randInt(rng) % 10000, 0, 100, rng);

View File

@ -253,7 +253,7 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Border, WarpAffine,
const double aplha = CV_PI / 4;
const double mat[2 * 3] =
{
std::cos(aplha), -std::sin(aplha), src.cols / 2,
std::cos(aplha), -std::sin(aplha), static_cast<double>(src.cols) / 2.0,
std::sin(aplha), std::cos(aplha), 0
};
const cv::Mat M(2, 3, CV_64F, (void*) mat);
@ -301,7 +301,7 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Border, WarpPerspective,
declare.in(src, WARMUP_RNG);
const double aplha = CV_PI / 4;
double mat[3][3] = { {std::cos(aplha), -std::sin(aplha), src.cols / 2},
double mat[3][3] = { {std::cos(aplha), -std::sin(aplha), static_cast<double>(src.cols) / 2.0},
{std::sin(aplha), std::cos(aplha), 0},
{0.0, 0.0, 1.0}};
const cv::Mat M(3, 3, CV_64F, (void*) mat);

View File

@ -730,38 +730,6 @@ struct CV_EXPORTS L1
}
};
/*
* Hamming distance functor - counts the bit differences between two strings - useful for the BRIEF descriptor:
* the bit count of A exclusive-ORed with B
*/
struct CV_EXPORTS Hamming
{
enum { normType = NORM_HAMMING };
typedef unsigned char ValueType;
typedef int ResultType;
/** this will count the bits in a ^ b
*/
ResultType operator()( const unsigned char* a, const unsigned char* b, int size ) const
{
return normHamming(a, b, size);
}
};
typedef Hamming HammingLUT;
template<int cellsize> struct HammingMultilevel
{
enum { normType = NORM_HAMMING + (cellsize>1) };
typedef unsigned char ValueType;
typedef int ResultType;
ResultType operator()( const unsigned char* a, const unsigned char* b, int size ) const
{
return normHamming(a, b, size, cellsize);
}
};
/****************************************************************************************\
* DescriptorMatcher *
\****************************************************************************************/

View File

@ -812,7 +812,7 @@ void AKAZEFeatures::Compute_Main_Orientation(KeyPoint& kpt, const std::vector<TE
}
}
}
fastAtan2(resY, resX, Ang, ang_size, false);
hal::fastAtan2(resY, resX, Ang, ang_size, false);
// Loop slides pi/3 window around feature point
for (ang1 = 0; ang1 < (float)(2.0 * CV_PI); ang1 += 0.15f) {
ang2 = (ang1 + (float)(CV_PI / 3.0) >(float)(2.0*CV_PI) ? ang1 - (float)(5.0*CV_PI / 3.0) : ang1 + (float)(CV_PI / 3.0));

View File

@ -89,13 +89,13 @@ struct CV_EXPORTS LinearIndexParams : public IndexParams
struct CV_EXPORTS CompositeIndexParams : public IndexParams
{
CompositeIndexParams(int trees = 4, int branching = 32, int iterations = 11,
cvflann::flann_centers_init_t centers_init = cvflann::FLANN_CENTERS_RANDOM, float cb_index = 0.2 );
cvflann::flann_centers_init_t centers_init = cvflann::FLANN_CENTERS_RANDOM, float cb_index = 0.2f );
};
struct CV_EXPORTS AutotunedIndexParams : public IndexParams
{
AutotunedIndexParams(float target_precision = 0.8, float build_weight = 0.01,
float memory_weight = 0, float sample_fraction = 0.1);
AutotunedIndexParams(float target_precision = 0.8f, float build_weight = 0.01f,
float memory_weight = 0, float sample_fraction = 0.1f);
};
struct CV_EXPORTS HierarchicalClusteringIndexParams : public IndexParams
@ -107,7 +107,7 @@ struct CV_EXPORTS HierarchicalClusteringIndexParams : public IndexParams
struct CV_EXPORTS KMeansIndexParams : public IndexParams
{
KMeansIndexParams(int branching = 32, int iterations = 11,
cvflann::flann_centers_init_t centers_init = cvflann::FLANN_CENTERS_RANDOM, float cb_index = 0.2 );
cvflann::flann_centers_init_t centers_init = cvflann::FLANN_CENTERS_RANDOM, float cb_index = 0.2f );
};
struct CV_EXPORTS LshIndexParams : public IndexParams

View File

@ -0,0 +1,12 @@
set(the_description "The Hardware Acceleration Layer (HAL) module")
set(OPENCV_MODULE_TYPE STATIC)
# set(OPENCV_MODULE_IS_PART_OF_WORLD FALSE)
if(UNIX)
if(CMAKE_COMPILER_IS_GNUCXX OR CV_ICC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
endif()
endif()
ocv_define_module(hal)

View File

@ -0,0 +1,98 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_HAL_HPP__
#define __OPENCV_HAL_HPP__
#include "opencv2/hal/defs.h"
/**
@defgroup hal Hardware Acceleration Layer
*/
namespace cv { namespace hal {
namespace Error {
enum
{
Ok = 0,
Unknown = -1
};
}
int normHamming(const uchar* a, int n);
int normHamming(const uchar* a, const uchar* b, int n);
int normHamming(const uchar* a, int n, int cellSize);
int normHamming(const uchar* a, const uchar* b, int n, int cellSize);
//////////////////////////////// low-level functions ////////////////////////////////
int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
int normL1_(const uchar* a, const uchar* b, int n);
float normL1_(const float* a, const float* b, int n);
float normL2Sqr_(const float* a, const float* b, int n);
void exp(const float* src, float* dst, int n);
void exp(const double* src, double* dst, int n);
void log(const float* src, float* dst, int n);
void log(const double* src, double* dst, int n);
void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
void magnitude(const float* x, const float* y, float* dst, int n);
void magnitude(const double* x, const double* y, double* dst, int n);
void sqrt(const float* src, float* dst, int len);
void sqrt(const double* src, double* dst, int len);
void invSqrt(const float* src, float* dst, int len);
void invSqrt(const double* src, double* dst, int len);
}} //cv::hal
#endif //__OPENCV_HAL_HPP__
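A possible usage sketch of the freestanding HAL functions declared above. It assumes an OpenCV build from this branch that installs the hal module headers under opencv2/hal.hpp (the include path is an assumption), and fastAtan2 is an approximation, so the printed angles are approximate:

#include <cstdio>
#include "opencv2/hal.hpp"   // assumed install path of the header added above

int main()
{
    // Hamming distance between two 32-byte (256-bit) binary descriptors.
    unsigned char d1[32], d2[32];
    for (int i = 0; i < 32; i++) { d1[i] = (unsigned char)i; d2[i] = (unsigned char)(i ^ 0x0F); }
    int dist = cv::hal::normHamming(d1, d2, 32);
    printf("Hamming distance: %d\n", dist);        // each byte differs in 4 bits -> 128

    // atan2 of 4 (y, x) pairs at once, in degrees.
    float y[] = { 0.f, 1.f, 1.f, -1.f };
    float x[] = { 1.f, 1.f, 0.f,  1.f };
    float ang[4];
    cv::hal::fastAtan2(y, x, ang, 4, true);
    for (int i = 0; i < 4; i++)
        printf("angle[%d] ~ %.1f\n", i, ang[i]);   // ~0, 45, 90, 315
    return 0;
}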

View File

@ -0,0 +1,675 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_DEF_H__
#define __OPENCV_DEF_H__
#if !defined _CRT_SECURE_NO_DEPRECATE && defined _MSC_VER && _MSC_VER > 1300
# define _CRT_SECURE_NO_DEPRECATE /* to avoid multiple Visual Studio warnings */
#endif
#include <limits.h>
#if defined __ICL
# define CV_ICC __ICL
#elif defined __ICC
# define CV_ICC __ICC
#elif defined __ECL
# define CV_ICC __ECL
#elif defined __ECC
# define CV_ICC __ECC
#elif defined __INTEL_COMPILER
# define CV_ICC __INTEL_COMPILER
#endif
#ifndef CV_INLINE
# if defined __cplusplus
# define CV_INLINE static inline
# elif defined _MSC_VER
# define CV_INLINE __inline
# else
# define CV_INLINE static
# endif
#endif
#if defined CV_ICC && !defined CV_ENABLE_UNROLLED
# define CV_ENABLE_UNROLLED 0
#else
# define CV_ENABLE_UNROLLED 1
#endif
#ifdef __GNUC__
# define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
#elif defined _MSC_VER
# define CV_DECL_ALIGNED(x) __declspec(align(x))
#else
# define CV_DECL_ALIGNED(x)
#endif
/* CPU features and intrinsics support */
#define CV_CPU_NONE 0
#define CV_CPU_MMX 1
#define CV_CPU_SSE 2
#define CV_CPU_SSE2 3
#define CV_CPU_SSE3 4
#define CV_CPU_SSSE3 5
#define CV_CPU_SSE4_1 6
#define CV_CPU_SSE4_2 7
#define CV_CPU_POPCNT 8
#define CV_CPU_AVX 10
#define CV_CPU_AVX2 11
#define CV_CPU_FMA3 12
#define CV_CPU_AVX_512F 13
#define CV_CPU_AVX_512BW 14
#define CV_CPU_AVX_512CD 15
#define CV_CPU_AVX_512DQ 16
#define CV_CPU_AVX_512ER 17
#define CV_CPU_AVX_512IFMA512 18
#define CV_CPU_AVX_512PF 19
#define CV_CPU_AVX_512VBMI 20
#define CV_CPU_AVX_512VL 21
#define CV_CPU_NEON 100
// when adding to this list remember to update the enum in core/utility.cpp
#define CV_HARDWARE_MAX_FEATURE 255
// do not include SSE/AVX/NEON headers for NVCC compiler
#ifndef __CUDACC__
#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
# include <emmintrin.h>
# define CV_MMX 1
# define CV_SSE 1
# define CV_SSE2 1
# if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <pmmintrin.h>
# define CV_SSE3 1
# endif
# if defined __SSSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <tmmintrin.h>
# define CV_SSSE3 1
# endif
# if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <smmintrin.h>
# define CV_SSE4_1 1
# endif
# if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <nmmintrin.h>
# define CV_SSE4_2 1
# endif
# if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
# ifdef _MSC_VER
# include <nmmintrin.h>
# else
# include <popcntintrin.h>
# endif
# define CV_POPCNT 1
# endif
# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
// MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
// See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
# include <immintrin.h>
# define CV_AVX 1
# if defined(_XCR_XFEATURE_ENABLED_MASK)
# define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
# else
# define __xgetbv() 0
# endif
# endif
# if defined __AVX2__ || (defined _MSC_VER && _MSC_VER >= 1800 && 0)
# include <immintrin.h>
# define CV_AVX2 1
# if defined __FMA__
# define CV_FMA3 1
# endif
# endif
#endif
#if (defined WIN32 || defined _WIN32) && defined(_M_ARM)
# include <Intrin.h>
# include "arm_neon.h"
# define CV_NEON 1
# define CPU_HAS_NEON_FEATURE (true)
#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
# include <arm_neon.h>
# define CV_NEON 1
#endif
#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__)
# define CV_VFP 1
#endif
#endif // __CUDACC__
#ifndef CV_POPCNT
#define CV_POPCNT 0
#endif
#ifndef CV_MMX
# define CV_MMX 0
#endif
#ifndef CV_SSE
# define CV_SSE 0
#endif
#ifndef CV_SSE2
# define CV_SSE2 0
#endif
#ifndef CV_SSE3
# define CV_SSE3 0
#endif
#ifndef CV_SSSE3
# define CV_SSSE3 0
#endif
#ifndef CV_SSE4_1
# define CV_SSE4_1 0
#endif
#ifndef CV_SSE4_2
# define CV_SSE4_2 0
#endif
#ifndef CV_AVX
# define CV_AVX 0
#endif
#ifndef CV_AVX2
# define CV_AVX2 0
#endif
#ifndef CV_FMA3
# define CV_FMA3 0
#endif
#ifndef CV_AVX_512F
# define CV_AVX_512F 0
#endif
#ifndef CV_AVX_512BW
# define CV_AVX_512BW 0
#endif
#ifndef CV_AVX_512CD
# define CV_AVX_512CD 0
#endif
#ifndef CV_AVX_512DQ
# define CV_AVX_512DQ 0
#endif
#ifndef CV_AVX_512ER
# define CV_AVX_512ER 0
#endif
#ifndef CV_AVX_512IFMA512
# define CV_AVX_512IFMA512 0
#endif
#ifndef CV_AVX_512PF
# define CV_AVX_512PF 0
#endif
#ifndef CV_AVX_512VBMI
# define CV_AVX_512VBMI 0
#endif
#ifndef CV_AVX_512VL
# define CV_AVX_512VL 0
#endif
#ifndef CV_NEON
# define CV_NEON 0
#endif
#ifndef CV_VFP
# define CV_VFP 0
#endif
/* primitive types */
/*
schar - signed 1 byte integer
uchar - unsigned 1 byte integer
short - signed 2 byte integer
ushort - unsigned 2 byte integer
int - signed 4 byte integer
uint - unsigned 4 byte integer
int64 - signed 8 byte integer
uint64 - unsigned 8 byte integer
*/
#if !defined _MSC_VER && !defined __BORLANDC__
# if defined __cplusplus && __cplusplus >= 201103L
# include <cstdint>
typedef std::uint32_t uint;
# else
# include <stdint.h>
typedef uint32_t uint;
# endif
#else
typedef unsigned uint;
#endif
typedef signed char schar;
#ifndef __IPL_H__
typedef unsigned char uchar;
typedef unsigned short ushort;
#endif
#if defined _MSC_VER || defined __BORLANDC__
typedef __int64 int64;
typedef unsigned __int64 uint64;
# define CV_BIG_INT(n) n##I64
# define CV_BIG_UINT(n) n##UI64
#else
typedef int64_t int64;
typedef uint64_t uint64;
# define CV_BIG_INT(n) n##LL
# define CV_BIG_UINT(n) n##ULL
#endif
/* fundamental constants */
#define CV_PI 3.1415926535897932384626433832795
#define CV_2PI 6.283185307179586476925286766559
#define CV_LOG2 0.69314718055994530941723212145818
typedef union Cv32suf
{
int i;
unsigned u;
float f;
}
Cv32suf;
typedef union Cv64suf
{
int64 i;
uint64 u;
double f;
}
Cv64suf;
/****************************************************************************************\
* fast math *
\****************************************************************************************/
#if defined __BORLANDC__
# include <fastmath.h>
#elif defined __cplusplus
# include <cmath>
#else
# include <math.h>
#endif
#ifdef HAVE_TEGRA_OPTIMIZATION
# include "tegra_round.hpp"
#endif
//! @addtogroup core_utils
//! @{
#if CV_VFP
// 1. general scheme
#define ARM_ROUND(_value, _asm_string) \
int res; \
float temp; \
asm(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \
return res
// 2. version for double
#ifdef __clang__
#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
#else
#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
#endif
// 3. version for float
#define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
#endif // CV_VFP
/** @brief Rounds floating-point number to the nearest integer
@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
result is not defined.
*/
CV_INLINE int
cvRound( double value )
{
#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
&& defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
__m128d t = _mm_set_sd( value );
return _mm_cvtsd_si32(t);
#elif defined _MSC_VER && defined _M_IX86
int t;
__asm
{
fld value;
fistp t;
}
return t;
#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
TEGRA_ROUND_DBL(value);
#elif defined CV_ICC || defined __GNUC__
# if CV_VFP
ARM_ROUND_DBL(value);
# else
return (int)lrint(value);
# endif
#else
/* it's ok if round does not comply with IEEE754 standard;
the tests should allow +/-1 difference when the tested functions use round */
return (int)(value + (value >= 0 ? 0.5 : -0.5));
#endif
}
/** @brief Rounds floating-point number to the nearest integer not larger than the original.
The function computes an integer i such that:
\f[i \le \texttt{value} < i+1\f]
@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
result is not defined.
*/
CV_INLINE int cvFloor( double value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
__m128d t = _mm_set_sd( value );
int i = _mm_cvtsd_si32(t);
return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t,i)));
#elif defined __GNUC__
int i = (int)value;
return i - (i > value);
#else
int i = cvRound(value);
float diff = (float)(value - i);
return i - (diff < 0);
#endif
}
/** @brief Rounds floating-point number to the nearest integer not smaller than the original.
The function computes an integer i such that:
\f[i - 1 < \texttt{value} \le i\f]
@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
result is not defined.
*/
CV_INLINE int cvCeil( double value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
__m128d t = _mm_set_sd( value );
int i = _mm_cvtsd_si32(t);
return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t));
#elif defined __GNUC__
int i = (int)value;
return i + (i < value);
#else
int i = cvRound(value);
float diff = (float)(i - value);
return i + (diff < 0);
#endif
}
/** @brief Determines if the argument is Not A Number.
@param value The input floating-point value
The function returns 1 if the argument is Not A Number (as defined by IEEE754 standard), 0
otherwise. */
CV_INLINE int cvIsNaN( double value )
{
Cv64suf ieee754;
ieee754.f = value;
return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) +
((unsigned)ieee754.u != 0) > 0x7ff00000;
}
/** @brief Determines if the argument is Infinity.
@param value The input floating-point value
The function returns 1 if the argument is a plus or minus infinity (as defined by IEEE754 standard)
and 0 otherwise. */
CV_INLINE int cvIsInf( double value )
{
Cv64suf ieee754;
ieee754.f = value;
return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 &&
(unsigned)ieee754.u == 0;
}
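The rounding helpers above differ only in which integer they pick; a minimal sketch of their behavior, assuming the header added in this patch is available as opencv2/hal/defs.h:

#include <cstdio>
#include "opencv2/hal/defs.h"   // assumed install path

int main()
{
    double v = -2.7;
    printf("cvRound(%.1f) = %d\n", v, cvRound(v));            // -3 : nearest integer
    printf("cvFloor(%.1f) = %d\n", v, cvFloor(v));            // -3 : largest i with i <= v
    printf("cvCeil (%.1f) = %d\n", v, cvCeil(v));             // -2 : smallest i with i >= v
    printf("cvIsNaN(0.0) = %d\n", cvIsNaN(0.0));              // 0
    printf("cvIsInf(1e308 * 10) = %d\n", cvIsInf(1e308 * 10)); // 1 : overflows to +infinity
    return 0;
}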
#ifdef __cplusplus
/** @overload */
CV_INLINE int cvRound(float value)
{
#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && \
defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
__m128 t = _mm_set_ss( value );
return _mm_cvtss_si32(t);
#elif defined _MSC_VER && defined _M_IX86
int t;
__asm
{
fld value;
fistp t;
}
return t;
#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
TEGRA_ROUND_FLT(value);
#elif defined CV_ICC || defined __GNUC__
# if CV_VFP
ARM_ROUND_FLT(value);
# else
return (int)lrintf(value);
# endif
#else
/* it's ok if round does not comply with IEEE754 standard;
the tests should allow +/-1 difference when the tested functions use round */
return (int)(value + (value >= 0 ? 0.5f : -0.5f));
#endif
}
/** @overload */
CV_INLINE int cvRound( int value )
{
return value;
}
/** @overload */
CV_INLINE int cvFloor( float value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
__m128 t = _mm_set_ss( value );
int i = _mm_cvtss_si32(t);
return i - _mm_movemask_ps(_mm_cmplt_ss(t, _mm_cvtsi32_ss(t,i)));
#elif defined __GNUC__
int i = (int)value;
return i - (i > value);
#else
int i = cvRound(value);
float diff = (float)(value - i);
return i - (diff < 0);
#endif
}
/** @overload */
CV_INLINE int cvFloor( int value )
{
return value;
}
/** @overload */
CV_INLINE int cvCeil( float value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
__m128 t = _mm_set_ss( value );
int i = _mm_cvtss_si32(t);
return i + _mm_movemask_ps(_mm_cmplt_ss(_mm_cvtsi32_ss(t,i), t));
#elif defined __GNUC__
int i = (int)value;
return i + (i < value);
#else
int i = cvRound(value);
float diff = (float)(i - value);
return i + (diff < 0);
#endif
}
/** @overload */
CV_INLINE int cvCeil( int value )
{
return value;
}
/** @overload */
CV_INLINE int cvIsNaN( float value )
{
Cv32suf ieee754;
ieee754.f = value;
return (ieee754.u & 0x7fffffff) > 0x7f800000;
}
/** @overload */
CV_INLINE int cvIsInf( float value )
{
Cv32suf ieee754;
ieee754.f = value;
return (ieee754.u & 0x7fffffff) == 0x7f800000;
}
#include <algorithm>
namespace cv
{
/////////////// saturate_cast (used in image & signal processing) ///////////////////
/**
Template function for accurate conversion from one primitive type to another.
The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\<T\>()
and others. They perform an efficient and accurate conversion from one primitive type to another
(see the introduction chapter). saturate in the name means that when the input value v is out of the
range of the target type, the result is not formed just by taking low bits of the input, but instead
the value is clipped. For example:
@code
uchar a = saturate_cast<uchar>(-100); // a = 0 (UCHAR_MIN)
short b = saturate_cast<short>(33333.33333); // b = 32767 (SHRT_MAX)
@endcode
Such clipping is done when the target type is unsigned char, signed char, unsigned short or
signed short. For 32-bit integers, no clipping is done.
When the parameter is a floating-point value and the target type is an integer (8-, 16- or 32-bit),
the floating-point value is first rounded to the nearest integer and then clipped if needed (when
the target type is 8- or 16-bit).
This operation is used throughout OpenCV's image processing functions, from the simplest to the most complex.
@param v Function parameter.
@sa add, subtract, multiply, divide, Mat::convertTo
*/
template<typename _Tp> static inline _Tp saturate_cast(uchar v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(schar v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(ushort v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(short v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(unsigned v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(int v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(float v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(double v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(int64 v) { return _Tp(v); }
/** @overload */
template<typename _Tp> static inline _Tp saturate_cast(uint64 v) { return _Tp(v); }
//! @cond IGNORED
template<> inline uchar saturate_cast<uchar>(schar v) { return (uchar)std::max((int)v, 0); }
template<> inline uchar saturate_cast<uchar>(ushort v) { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); }
template<> inline uchar saturate_cast<uchar>(int v) { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
template<> inline uchar saturate_cast<uchar>(short v) { return saturate_cast<uchar>((int)v); }
template<> inline uchar saturate_cast<uchar>(unsigned v) { return (uchar)std::min(v, (unsigned)UCHAR_MAX); }
template<> inline uchar saturate_cast<uchar>(float v) { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
template<> inline uchar saturate_cast<uchar>(double v) { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
template<> inline uchar saturate_cast<uchar>(int64 v) { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
template<> inline uchar saturate_cast<uchar>(uint64 v) { return (uchar)std::min(v, (uint64)UCHAR_MAX); }
template<> inline schar saturate_cast<schar>(uchar v) { return (schar)std::min((int)v, SCHAR_MAX); }
template<> inline schar saturate_cast<schar>(ushort v) { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
template<> inline schar saturate_cast<schar>(int v) { return (schar)((unsigned)(v-SCHAR_MIN) <= (unsigned)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
template<> inline schar saturate_cast<schar>(short v) { return saturate_cast<schar>((int)v); }
template<> inline schar saturate_cast<schar>(unsigned v) { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
template<> inline schar saturate_cast<schar>(float v) { int iv = cvRound(v); return saturate_cast<schar>(iv); }
template<> inline schar saturate_cast<schar>(double v) { int iv = cvRound(v); return saturate_cast<schar>(iv); }
template<> inline schar saturate_cast<schar>(int64 v) { return (schar)((uint64)((int64)v-SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
template<> inline schar saturate_cast<schar>(uint64 v) { return (schar)std::min(v, (uint64)SCHAR_MAX); }
template<> inline ushort saturate_cast<ushort>(schar v) { return (ushort)std::max((int)v, 0); }
template<> inline ushort saturate_cast<ushort>(short v) { return (ushort)std::max((int)v, 0); }
template<> inline ushort saturate_cast<ushort>(int v) { return (ushort)((unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
template<> inline ushort saturate_cast<ushort>(unsigned v) { return (ushort)std::min(v, (unsigned)USHRT_MAX); }
template<> inline ushort saturate_cast<ushort>(float v) { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
template<> inline ushort saturate_cast<ushort>(double v) { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
template<> inline ushort saturate_cast<ushort>(int64 v) { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
template<> inline ushort saturate_cast<ushort>(uint64 v) { return (ushort)std::min(v, (uint64)USHRT_MAX); }
template<> inline short saturate_cast<short>(ushort v) { return (short)std::min((int)v, SHRT_MAX); }
template<> inline short saturate_cast<short>(int v) { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
template<> inline short saturate_cast<short>(unsigned v) { return (short)std::min(v, (unsigned)SHRT_MAX); }
template<> inline short saturate_cast<short>(float v) { int iv = cvRound(v); return saturate_cast<short>(iv); }
template<> inline short saturate_cast<short>(double v) { int iv = cvRound(v); return saturate_cast<short>(iv); }
template<> inline short saturate_cast<short>(int64 v) { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
template<> inline short saturate_cast<short>(uint64 v) { return (short)std::min(v, (uint64)SHRT_MAX); }
template<> inline int saturate_cast<int>(float v) { return cvRound(v); }
template<> inline int saturate_cast<int>(double v) { return cvRound(v); }
// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
template<> inline unsigned saturate_cast<unsigned>(float v) { return cvRound(v); }
template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
//! @endcond
}
#endif // __cplusplus
//! @} core_utils
#endif //__OPENCV_DEF_H__
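For the floating-point path described in the saturate_cast documentation above (round to the nearest integer first, then clip for 8- and 16-bit targets), a short sketch assuming the same opencv2/hal/defs.h header:

#include <cstdio>
#include "opencv2/hal/defs.h"   // assumed install path

int main()
{
    using cv::saturate_cast;
    printf("%d\n", (int)saturate_cast<uchar>(257.4f)); // 255 : rounded to 257, clipped to UCHAR_MAX
    printf("%d\n", (int)saturate_cast<uchar>(-0.6f));  // 0   : rounded to -1, clipped to 0
    printf("%d\n", (int)saturate_cast<short>(40000));  // 32767 (SHRT_MAX)
    printf("%d\n", saturate_cast<int>(3.7f));          // 4   : 32-bit target, rounded, not clipped
    return 0;
}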

View File

@ -0,0 +1,292 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_HAL_INTRIN_HPP__
#define __OPENCV_HAL_INTRIN_HPP__
#include <cmath>
#include <float.h>
#include <stdlib.h>
#define OPENCV_HAL_ADD(a, b) ((a) + (b))
#define OPENCV_HAL_AND(a, b) ((a) & (b))
#define OPENCV_HAL_NOP(a) (a)
#define OPENCV_HAL_1ST(a, b) (a)
// unlike the HAL API, which lives in cv::hal,
// we put the intrinsics directly into the cv namespace
// so they are easier to use from within OpenCV code
namespace cv {
template<typename _Tp> struct V_TypeTraits
{
typedef _Tp int_type;
typedef _Tp uint_type;
typedef _Tp abs_type;
typedef _Tp sum_type;
enum { delta = 0, shift = 0 };
static int_type reinterpret_int(_Tp x) { return x; }
static uint_type reinterpret_uint(_Tp x) { return x; }
static _Tp reinterpret_from_int(int_type x) { return (_Tp)x; }
};
template<> struct V_TypeTraits<uchar>
{
typedef uchar value_type;
typedef schar int_type;
typedef uchar uint_type;
typedef uchar abs_type;
typedef int sum_type;
typedef ushort w_type;
enum { delta = 128, shift = 8 };
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<schar>
{
typedef schar value_type;
typedef schar int_type;
typedef uchar uint_type;
typedef uchar abs_type;
typedef int sum_type;
typedef short w_type;
enum { delta = 128, shift = 8 };
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<ushort>
{
typedef ushort value_type;
typedef short int_type;
typedef ushort uint_type;
typedef ushort abs_type;
typedef int sum_type;
typedef unsigned w_type;
typedef uchar nu_type;
enum { delta = 32768, shift = 16 };
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<short>
{
typedef short value_type;
typedef short int_type;
typedef ushort uint_type;
typedef ushort abs_type;
typedef int sum_type;
typedef int w_type;
typedef uchar nu_type;
typedef schar n_type;
enum { delta = 128, shift = 8 };
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<unsigned>
{
typedef unsigned value_type;
typedef int int_type;
typedef unsigned uint_type;
typedef unsigned abs_type;
typedef unsigned sum_type;
typedef uint64 w_type;
typedef ushort nu_type;
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<int>
{
typedef int value_type;
typedef int int_type;
typedef unsigned uint_type;
typedef unsigned abs_type;
typedef int sum_type;
typedef int64 w_type;
typedef short n_type;
typedef ushort nu_type;
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<uint64>
{
typedef uint64 value_type;
typedef int64 int_type;
typedef uint64 uint_type;
typedef uint64 abs_type;
typedef uint64 sum_type;
typedef unsigned nu_type;
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<int64>
{
typedef int64 value_type;
typedef int64 int_type;
typedef uint64 uint_type;
typedef uint64 abs_type;
typedef int64 sum_type;
typedef int nu_type;
static int_type reinterpret_int(value_type x) { return (int_type)x; }
static uint_type reinterpret_uint(value_type x) { return (uint_type)x; }
static value_type reinterpret_from_int(int_type x) { return (value_type)x; }
};
template<> struct V_TypeTraits<float>
{
typedef float value_type;
typedef int int_type;
typedef unsigned uint_type;
typedef float abs_type;
typedef float sum_type;
typedef double w_type;
static int_type reinterpret_int(value_type x)
{
Cv32suf u;
u.f = x;
return u.i;
}
static uint_type reinterpret_uint(value_type x)
{
Cv32suf u;
u.f = x;
return u.u;
}
static value_type reinterpret_from_int(int_type x)
{
Cv32suf u;
u.i = x;
return u.f;
}
};
template<> struct V_TypeTraits<double>
{
typedef double value_type;
typedef int64 int_type;
typedef uint64 uint_type;
typedef double abs_type;
typedef double sum_type;
static int_type reinterpret_int(value_type x)
{
Cv64suf u;
u.f = x;
return u.i;
}
static uint_type reinterpret_uint(value_type x)
{
Cv64suf u;
u.f = x;
return u.u;
}
static value_type reinterpret_from_int(int_type x)
{
Cv64suf u;
u.i = x;
return u.f;
}
};
}
#if CV_SSE2
#include "opencv2/hal/intrin_sse.hpp"
#elif CV_NEON
#include "opencv2/hal/intrin_neon.hpp"
#else
#include "opencv2/hal/intrin_cpp.hpp"
#endif
#ifndef CV_SIMD128
#define CV_SIMD128 0
#endif
#ifndef CV_SIMD128_64F
#define CV_SIMD128_64F 0
#endif
#endif
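A minimal sketch of what the reinterpret helpers in V_TypeTraits are for: bit-level manipulation of floating-point lanes, which is how the C++ fallback implements bitwise operations and comparison masks. It assumes this header is installed as opencv2/hal/intrin.hpp:

#include <cstdio>
#include "opencv2/hal/intrin.hpp"   // assumed install path of the header above

int main()
{
    // Reinterpret a float as its raw bit pattern, clear the sign bit, convert back.
    typedef cv::V_TypeTraits<float> Traits;
    float x = -3.5f;
    Traits::int_type bits = Traits::reinterpret_int(x);  // raw IEEE-754 bits as int
    bits &= 0x7fffffff;                                   // clear the sign bit -> |x|
    float ax = Traits::reinterpret_from_int(bits);
    printf("|%g| = %g\n", x, ax);                         // |-3.5| = 3.5
    return 0;
}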

View File

@ -0,0 +1,811 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_HAL_INTRIN_CPP_HPP__
#define __OPENCV_HAL_INTRIN_CPP_HPP__
namespace cv
{
template<typename _Tp, int n> struct v_reg
{
typedef _Tp lane_type;
typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec;
typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec;
enum { nlanes = n };
explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
_Tp s4, _Tp s5, _Tp s6, _Tp s7)
{
s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
}
v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
_Tp s4, _Tp s5, _Tp s6, _Tp s7,
_Tp s8, _Tp s9, _Tp s10, _Tp s11,
_Tp s12, _Tp s13, _Tp s14, _Tp s15)
{
s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
}
v_reg() {}
v_reg(const v_reg<_Tp, n> & r)
{
for( int i = 0; i < n; i++ )
s[i] = r.s[i];
}
_Tp get(const int i) const { return s[i]; }
_Tp get0() const { return s[0]; }
v_reg<_Tp, n> high() const
{
v_reg<_Tp, n> c;
int i;
for( i = 0; i < n/2; i++ )
{
c.s[i] = s[i+(n/2)];
c.s[i+(n/2)] = 0;
}
return c;
}
static v_reg<_Tp, n> zero()
{
v_reg<_Tp, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = (_Tp)0;
return c;
}
static v_reg<_Tp, n> all(_Tp s)
{
v_reg<_Tp, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = s;
return c;
}
template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
{
size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
v_reg<_Tp2, n2> c;
memcpy(&c.s[0], &s[0], bytes);
return c;
}
_Tp s[n];
};
#define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> \
operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
return c; \
} \
template<typename _Tp, int n> inline v_reg<_Tp, n>& \
operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
for( int i = 0; i < n; i++ ) \
a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
return a; \
}
OPENCV_HAL_IMPL_BIN_OP(+)
OPENCV_HAL_IMPL_BIN_OP(-)
OPENCV_HAL_IMPL_BIN_OP(*)
OPENCV_HAL_IMPL_BIN_OP(/)
#define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
v_reg<_Tp, n> c; \
typedef typename V_TypeTraits<_Tp>::int_type itype; \
for( int i = 0; i < n; i++ ) \
c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
return c; \
} \
template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
typedef typename V_TypeTraits<_Tp>::int_type itype; \
for( int i = 0; i < n; i++ ) \
a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
return a; \
}
OPENCV_HAL_IMPL_BIT_OP(&)
OPENCV_HAL_IMPL_BIT_OP(|)
OPENCV_HAL_IMPL_BIT_OP(^)
template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
{
v_reg<_Tp, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i]));
return c;
}
#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
{ \
v_reg<_Tp2, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = cfunc(a.s[i]); \
return c; \
}
OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
typename V_TypeTraits<_Tp>::abs_type)
OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, hfunc, cfunc) \
template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = cfunc(a.s[i], b.s[i]); \
return c; \
} \
template<typename _Tp, int n> inline _Tp hfunc(const v_reg<_Tp, n>& a) \
{ \
_Tp c = a.s[0]; \
for( int i = 1; i < n; i++ ) \
c = cfunc(c, a.s[i]); \
return c; \
}
OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, v_reduce_min, std::min)
OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, v_reduce_max, std::max)
template<typename _Tp, int n>
inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
{
for( int i = 0; i < n; i++ )
{
minval.s[i] = std::min(a.s[i], b.s[i]);
maxval.s[i] = std::max(a.s[i], b.s[i]);
}
}
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
template<typename _Tp, int n> \
inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
typedef typename V_TypeTraits<_Tp>::int_type itype; \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
return c; \
}
OPENCV_HAL_IMPL_CMP_OP(<)
OPENCV_HAL_IMPL_CMP_OP(>)
OPENCV_HAL_IMPL_CMP_OP(<=)
OPENCV_HAL_IMPL_CMP_OP(>=)
OPENCV_HAL_IMPL_CMP_OP(==)
OPENCV_HAL_IMPL_CMP_OP(!=)
#define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
template<typename _Tp, int n> \
inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
typedef _Tp2 rtype; \
v_reg<rtype, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
return c; \
}
OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
OPENCV_HAL_IMPL_ADD_SUB_OP(v_absdiff, -, (rtype)std::abs, typename V_TypeTraits<_Tp>::abs_type)
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
{
v_reg<_Tp, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = 1.f/std::sqrt(a.s[i]);
return c;
}
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
v_reg<_Tp, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
return c;
}
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
v_reg<_Tp, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
return c;
}
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
const v_reg<_Tp, n>& c)
{
v_reg<_Tp, n> d;
for( int i = 0; i < n; i++ )
d.s[i] = a.s[i]*b.s[i] + c.s[i];
return d;
}
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
typedef typename V_TypeTraits<_Tp>::w_type w_type;
v_reg<w_type, n/2> c;
for( int i = 0; i < (n/2); i++ )
c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
return c;
}
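v_dotprod pairs adjacent lanes and accumulates each pair into the next wider type (a0*b0 + a1*b1, a2*b2 + a3*b3, ...). A short sketch with worked values, assuming the universal intrinsics headers from this branch and the v_int16x8/v_int32x4 typedefs defined further down in this file:

#include <cstdio>
#include "opencv2/hal/intrin.hpp"   // assumed install path

int main()
{
    using namespace cv;
    v_int16x8 a(1, 2, 3, 4, 5, 6, 7, 8);
    v_int16x8 b(1, 1, 1, 1, 2, 2, 2, 2);
    v_int32x4 r = v_dotprod(a, b);                              // 8 x int16 -> 4 x int32
    int out[4];
    v_store(out, r);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);    // 3 7 22 30
    return 0;
}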
template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
{
typedef typename V_TypeTraits<_Tp>::w_type w_type;
for( int i = 0; i < (n/2); i++ )
{
c.s[i] = (w_type)a.s[i]*b.s[i];
d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
}
}
template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
{
typedef typename V_TypeTraits<_Tp>::w_type w_type;
for( int i = 0; i < (n/2); i++ )
{
c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
}
}
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = (_Tp)(a.s[i] shift_op imm); \
return c; \
}
OPENCV_HAL_IMPL_SHIFT_OP(<<)
OPENCV_HAL_IMPL_SHIFT_OP(>>)
template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
{
typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
for( int i = 1; i < n; i++ )
c += a.s[i];
return c;
}
template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
{
int mask = 0;
for( int i = 0; i < n; i++ )
mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
return mask;
}
template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
{
for( int i = 0; i < n; i++ )
if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
return false;
return true;
}
template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
{
for( int i = 0; i < n; i++ )
if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
return true;
return false;
}
template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
v_reg<_Tp, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(mask.s[i]) < 0 ? b.s[i] : a.s[i];
return c;
}
template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
{
for( int i = 0; i < (n/2); i++ )
{
b0.s[i] = a.s[i];
b1.s[i] = a.s[i+(n/2)];
}
}
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
v_reinterpret_as_int(const v_reg<_Tp, n>& a)
{
v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
return c;
}
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
{
v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
return c;
}
template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
{
int i;
for( i = 0; i < n/2; i++ )
{
b0.s[i*2] = a0.s[i];
b0.s[i*2+1] = a1.s[i];
}
for( ; i < n; i++ )
{
b1.s[i*2-n] = a0.s[i];
b1.s[i*2-n+1] = a1.s[i];
}
}
template<typename _Tp, int n> inline v_reg<_Tp, n> v_load(const _Tp* ptr)
{
return v_reg<_Tp, n>(ptr);
}
template<typename _Tp, int n> inline v_reg<_Tp, n> v_load_aligned(const _Tp* ptr)
{
return v_reg<_Tp, n>(ptr);
}
template<typename _Tp, int n> inline v_reg<_Tp, n> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
{
v_reg<_Tp, n> c;
for( int i = 0; i < n/2; i++ )
{
c.s[i] = loptr[i];
c.s[i+n/2] = hiptr[i];
}
return c;
}
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n> v_load_expand(const _Tp* ptr)
{
typedef typename V_TypeTraits<_Tp>::w_type w_type;
v_reg<w_type, n> c;
for( int i = 0; i < n; i++ )
{
c.s[i] = ptr[i];
}
return c;
}
template<typename _Tp, int n> inline v_reg<typename
V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type, n> v_load_expand_q(const _Tp* ptr)
{
typedef typename V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type w_type;
v_reg<w_type, n> c;
for( int i = 0; i < n; i++ )
{
c.s[i] = ptr[i];
}
return c;
}
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
{
int i, i3;
for( i = i3 = 0; i < n; i++, i3 += 3 )
{
a.s[i] = ptr[i3];
b.s[i] = ptr[i3+1];
c.s[i] = ptr[i3+2];
}
}
template<typename _Tp, int n>
inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
v_reg<_Tp, n>& d)
{
int i, i4;
for( i = i4 = 0; i < n; i++, i4 += 4 )
{
a.s[i] = ptr[i4];
b.s[i] = ptr[i4+1];
c.s[i] = ptr[i4+2];
d.s[i] = ptr[i4+3];
}
}
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
{
int i, i3;
for( i = i3 = 0; i < n; i++, i3 += 3 )
{
ptr[i3] = a.s[i];
ptr[i3+1] = b.s[i];
ptr[i3+2] = c.s[i];
}
}
template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
const v_reg<_Tp, n>& d)
{
int i, i4;
for( i = i4 = 0; i < n; i++, i4 += 4 )
{
ptr[i4] = a.s[i];
ptr[i4+1] = b.s[i];
ptr[i4+2] = c.s[i];
ptr[i4+3] = d.s[i];
}
}
template<typename _Tp, int n>
inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
{
for( int i = 0; i < n; i++ )
ptr[i] = a.s[i];
}
template<typename _Tp, int n>
inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
{
for( int i = 0; i < (n/2); i++ )
ptr[i] = a.s[i];
}
template<typename _Tp, int n>
inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
{
for( int i = 0; i < (n/2); i++ )
ptr[i] = a.s[i+(n/2)];
}
template<typename _Tp, int n>
inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
{
for( int i = 0; i < n; i++ )
ptr[i] = a.s[i];
}
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
v_reg<_Tp, n> c;
for( int i = 0; i < (n/2); i++ )
{
c.s[i] = a.s[i];
c.s[i+(n/2)] = b.s[i];
}
return c;
}
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
v_reg<_Tp, n> c;
for( int i = 0; i < (n/2); i++ )
{
c.s[i] = a.s[i+(n/2)];
c.s[i+(n/2)] = b.s[i+(n/2)];
}
return c;
}
template<typename _Tp, int n>
inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
{
for( int i = 0; i < (n/2); i++ )
{
low.s[i] = a.s[i];
low.s[i+(n/2)] = b.s[i];
high.s[i] = a.s[i+(n/2)];
high.s[i+(n/2)] = b.s[i+(n/2)];
}
}
template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
{
v_reg<int, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = cvRound(a.s[i]);
return c;
}
template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
{
v_reg<int, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = cvFloor(a.s[i]);
return c;
}
template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
{
v_reg<int, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = cvCeil(a.s[i]);
return c;
}
template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
{
v_reg<int, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = (int)(a.s[i]);
return c;
}
template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
{
v_reg<int, n*2> c;
for( int i = 0; i < n; i++ )
{
c.s[i] = cvRound(a.s[i]);
c.s[i+n] = 0;
}
return c;
}
template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
{
v_reg<int, n*2> c;
for( int i = 0; i < n; i++ )
{
c.s[i] = cvFloor(a.s[i]);
c.s[i+n] = 0;
}
return c;
}
template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
{
v_reg<int, n*2> c;
for( int i = 0; i < n; i++ )
{
c.s[i] = cvCeil(a.s[i]);
c.s[i+n] = 0;
}
return c;
}
template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
{
v_reg<int, n*2> c;
for( int i = 0; i < n; i++ )
{
c.s[i] = (int)(a.s[i]);
c.s[i+n] = 0;
}
return c;
}
template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
{
v_reg<float, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = (float)a.s[i];
return c;
}
template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
{
v_reg<double, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = (double)a.s[i];
return c;
}
template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
{
v_reg<double, n> c;
for( int i = 0; i < n; i++ )
c.s[i] = (double)a.s[i];
return c;
}
template<typename _Tp>
inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
{
b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
}
typedef v_reg<uchar, 16> v_uint8x16;
typedef v_reg<schar, 16> v_int8x16;
typedef v_reg<ushort, 8> v_uint16x8;
typedef v_reg<short, 8> v_int16x8;
typedef v_reg<unsigned, 4> v_uint32x4;
typedef v_reg<int, 4> v_int32x4;
typedef v_reg<float, 4> v_float32x4;
typedef v_reg<float, 8> v_float32x8;
typedef v_reg<double, 2> v_float64x2;
typedef v_reg<uint64, 2> v_uint64x2;
typedef v_reg<int64, 2> v_int64x2;
#define OPENCV_HAL_IMPL_C_INIT(_Tpvec, _Tp, suffix) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } \
inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \
template<typename _Tp0, int n0> inline _Tpvec \
v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
OPENCV_HAL_IMPL_C_INIT(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_C_INIT(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_C_INIT(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_C_INIT(v_int16x8, short, s16)
OPENCV_HAL_IMPL_C_INIT(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_C_INIT(v_int32x4, int, s32)
OPENCV_HAL_IMPL_C_INIT(v_float32x4, float, f32)
OPENCV_HAL_IMPL_C_INIT(v_float64x2, double, f64)
OPENCV_HAL_IMPL_C_INIT(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_C_INIT(v_int64x2, int64, s64)
#define OPENCV_HAL_IMPL_C_SHIFT(_Tpvec, _Tp) \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return a << n; } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return a >> n; } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ \
_Tpvec c; \
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
return c; \
}
OPENCV_HAL_IMPL_C_SHIFT(v_uint16x8, ushort)
OPENCV_HAL_IMPL_C_SHIFT(v_int16x8, short)
OPENCV_HAL_IMPL_C_SHIFT(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_C_SHIFT(v_int32x4, int)
OPENCV_HAL_IMPL_C_SHIFT(v_uint64x2, uint64)
OPENCV_HAL_IMPL_C_SHIFT(v_int64x2, int64)
#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpnvec c; \
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
{ \
c.s[i] = saturate_cast<_Tpn>(a.s[i]); \
c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \
} \
return c; \
} \
template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpnvec c; \
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
{ \
c.s[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
} \
return c; \
} \
inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
{ \
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
ptr[i] = saturate_cast<_Tpn>(a.s[i]); \
} \
template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
{ \
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
}
OPENCV_HAL_IMPL_C_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack)
OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_int8x16, schar, pack)
OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u)
OPENCV_HAL_IMPL_C_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_int16x8, short, pack)
OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u)
OPENCV_HAL_IMPL_C_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
OPENCV_HAL_IMPL_C_PACK(v_int64x2, int64, v_int32x4, int, pack)
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& m3)
{
return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
}
}
#endif
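For orientation, here is a minimal usage sketch of the universal intrinsics declared above. It assumes the umbrella header opencv2/hal/intrin.hpp selects either this C fallback or one of the platform files (e.g. the NEON implementation below), which is expected to expose the same names; the saxpy4 helper and the sample data are invented for illustration only.

#include "opencv2/hal/intrin.hpp"
#include <cstdio>

// compute out = a*x + y over four float lanes
static void saxpy4(const float* x, const float* y, float a, float* out)
{
    cv::v_float32x4 vx = cv::v_load(x);
    cv::v_float32x4 vy = cv::v_load(y);
    cv::v_float32x4 vr = cv::v_muladd(cv::v_setall_f32(a), vx, vy);
    cv::v_store(out, vr);
}

int main()
{
    float x[4] = {1, 2, 3, 4}, y[4] = {10, 20, 30, 40}, out[4];
    saxpy4(x, y, 0.5f, out);
    for( int i = 0; i < 4; i++ )
        std::printf("%g ", out[i]);   // expected: 10.5 21 31.5 42
    return 0;
}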


@ -0,0 +1,823 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_HAL_INTRIN_NEON_HPP__
#define __OPENCV_HAL_INTRIN_NEON_HPP__
namespace cv
{
#define CV_SIMD128 1
struct v_uint8x16
{
typedef uchar lane_type;
enum { nlanes = 16 };
v_uint8x16() {}
explicit v_uint8x16(uint8x16_t v) : val(v) {}
v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
{
uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
val = vld1q_u8(v);
}
uchar get0() const
{
return vgetq_lane_u8(val, 0);
}
uint8x16_t val;
};
struct v_int8x16
{
typedef schar lane_type;
enum { nlanes = 16 };
v_int8x16() {}
explicit v_int8x16(int8x16_t v) : val(v) {}
v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
{
schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
val = vld1q_s8(v);
}
schar get0() const
{
return vgetq_lane_s8(val, 0);
}
int8x16_t val;
};
struct v_uint16x8
{
typedef ushort lane_type;
enum { nlanes = 8 };
v_uint16x8() {}
explicit v_uint16x8(uint16x8_t v) : val(v) {}
v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
{
ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
val = vld1q_u16(v);
}
ushort get0() const
{
return vgetq_lane_u16(val, 0);
}
uint16x8_t val;
};
struct v_int16x8
{
typedef short lane_type;
enum { nlanes = 8 };
v_int16x8() {}
explicit v_int16x8(int16x8_t v) : val(v) {}
v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
{
short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
val = vld1q_s16(v);
}
short get0() const
{
return vgetq_lane_s16(val, 0);
}
int16x8_t val;
};
struct v_uint32x4
{
typedef unsigned lane_type;
enum { nlanes = 4 };
v_uint32x4() {}
explicit v_uint32x4(uint32x4_t v) : val(v) {}
v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
{
unsigned v[] = {v0, v1, v2, v3};
val = vld1q_u32(v);
}
unsigned get0() const
{
return vgetq_lane_u32(val, 0);
}
uint32x4_t val;
};
struct v_int32x4
{
typedef int lane_type;
enum { nlanes = 4 };
v_int32x4() {}
explicit v_int32x4(int32x4_t v) : val(v) {}
v_int32x4(int v0, int v1, int v2, int v3)
{
int v[] = {v0, v1, v2, v3};
val = vld1q_s32(v);
}
int get0() const
{
return vgetq_lane_s32(val, 0);
}
int32x4_t val;
};
struct v_float32x4
{
typedef float lane_type;
enum { nlanes = 4 };
v_float32x4() {}
explicit v_float32x4(float32x4_t v) : val(v) {}
v_float32x4(float v0, float v1, float v2, float v3)
{
float v[] = {v0, v1, v2, v3};
val = vld1q_f32(v);
}
float get0() const
{
return vgetq_lane_f32(val, 0);
}
float32x4_t val;
};
struct v_uint64x2
{
typedef uint64 lane_type;
enum { nlanes = 2 };
v_uint64x2() {}
explicit v_uint64x2(uint64x2_t v) : val(v) {}
v_uint64x2(uint64 v0, uint64 v1)
{
uint64 v[] = {v0, v1};
val = vld1q_u64(v);
}
uint64 get0() const
{
return vgetq_lane_u64(val, 0);
}
uint64x2_t val;
};
struct v_int64x2
{
typedef int64 lane_type;
enum { nlanes = 2 };
v_int64x2() {}
explicit v_int64x2(int64x2_t v) : val(v) {}
v_int64x2(int64 v0, int64 v1)
{
int64 v[] = {v0, v1};
val = vld1q_s64(v);
}
int64 get0() const
{
return vgetq_lane_s64(val, 0);
}
int64x2_t val;
};
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, wsuffix, pack, op) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
hreg a1 = vqmov##op##_##wsuffix(a.val), b1 = vqmov##op##_##wsuffix(b.val); \
return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
hreg a1 = vqmov##op##_##wsuffix(a.val); \
vst1_##suffix(ptr, a1); \
} \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
hreg b1 = vqrshr##op##_n_##wsuffix(b.val, n); \
return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
vst1_##suffix(ptr, a1); \
}
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un)
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n)
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& m3)
{
float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
res = vmlaq_lane_f32(res, m1.val, vl, 1);
res = vmlaq_lane_f32(res, m2.val, vh, 0);
res = vmlaq_lane_f32(res, m3.val, vh, 1);
return v_float32x4(res);
}
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
a.val = intrin(a.val, b.val); \
return a; \
}
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
float32x4_t reciprocal = vrecpeq_f32(b.val);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
return v_float32x4(vmulq_f32(a.val, reciprocal));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
float32x4_t reciprocal = vrecpeq_f32(b.val);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
a.val = vmulq_f32(a.val, reciprocal);
return a;
}
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
v_int32x4& c, v_int32x4& d)
{
c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
}
inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
v_uint32x4& c, v_uint32x4& d)
{
c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
}
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
v_uint64x2& c, v_uint64x2& d)
{
c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
int32x4x2_t cd = vtrnq_s32(c, d);
return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
}
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
}
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
return a; \
}
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)
inline v_float32x4 operator ~ (const v_float32x4& a)
{
return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
}
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
float32x4_t e = vrsqrteq_f32(x1);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
return v_float32x4(vmulq_f32(x.val, e));
}
inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
float32x4_t e = vrsqrteq_f32(x.val);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
return v_float32x4(e);
}
inline v_float32x4 v_abs(v_float32x4 x)
{ return v_float32x4(vabsq_f32(x.val)); }
// TODO: exp, log, sin, cos
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(intrin(a.val, b.val)); \
}
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
// TODO: absdiff for signed integers
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
return v_sqrt(x);
}
inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
return v_float32x4(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
}
inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
return v_float32x4(vmlaq_f32(c.val, a.val, b.val));
}
// trade efficiency for convenience
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
scalartype CV_DECL_ALIGNED(16) buf[4]; \
v_store_aligned(buf, a); \
scalartype s0 = scalar_func(buf[0], buf[1]); \
scalartype s1 = scalar_func(buf[2], buf[3]); \
return scalar_func(s0, s1); \
}
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, max, std::max)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, min, std::min)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, max, std::max)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, min, std::min)
inline int v_signmask(const v_uint8x16& a)
{
int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
}
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }
inline int v_signmask(const v_uint16x8& a)
{
int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
}
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }
inline int v_signmask(const v_uint32x4& a)
{
int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
uint64x2_t v1 = vpaddlq_u32(v0);
return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
}
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
_Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
_Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
}
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
inline bool v_check_all(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); }
inline bool v_check_all(const v_int16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
inline bool v_check_all(const v_int32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_int8x16& a)
{ return v_check_any(v_reinterpret_as_u8(a)); }
inline bool v_check_any(const v_int16x8& a)
{ return v_check_any(v_reinterpret_as_u16(a)); }
inline bool v_check_any(const v_int32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }
#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
}
OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
}
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr);
uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
return v_uint32x4(vmovl_u16(v1));
}
inline v_int32x4 v_load_expand_q(const schar* ptr)
{
int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);
int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
return v_int32x4(vmovl_s16(v1));
}
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
_Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
b0.val = p.val[0]; \
b1.val = p.val[1]; \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
}
OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
inline v_int32x4 v_round(const v_float32x4& a)
{
static const int32x4_t v_sign = vdupq_n_s32((int)0x80000000),
v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
}
inline v_int32x4 v_floor(const v_float32x4& a)
{
int32x4_t a1 = vcvtq_s32_f32(a.val);
uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
}
inline v_int32x4 v_ceil(const v_float32x4& a)
{
int32x4_t a1 = vcvtq_s32_f32(a.val);
uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
}
inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(vcvtq_s32_f32(a.val)); }
#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
v_##_Tpvec& b0, v_##_Tpvec& b1, \
v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
/* m00 m01 m02 m03 */ \
/* m10 m11 m12 m13 */ \
/* m20 m21 m22 m23 */ \
/* m30 m31 m32 m33 */ \
_Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
_Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
/* m00 m10 m02 m12 */ \
/* m01 m11 m03 m13 */ \
/* m20 m30 m22 m32 */ \
/* m21 m31 m23 m33 */ \
b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
}
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
_Tpvec##x3_t v = vld3q_##suffix(ptr); \
a.val = v.val[0]; \
b.val = v.val[1]; \
c.val = v.val[2]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
_Tpvec##x4_t v = vld4q_##suffix(ptr); \
a.val = v.val[0]; \
b.val = v.val[1]; \
c.val = v.val[2]; \
d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
{ \
_Tpvec##x3_t v; \
v.val[0] = a.val; \
v.val[1] = b.val; \
v.val[2] = c.val; \
vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
const v_##_Tpvec& c, const v_##_Tpvec& d) \
{ \
_Tpvec##x4_t v; \
v.val[0] = a.val; \
v.val[1] = b.val; \
v.val[2] = c.val; \
v.val[3] = d.val; \
vst4q_##suffix(ptr, v); \
}
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
return v_float32x4(vcvtq_f32_s32(a.val));
}
}
#endif
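As a hedged sketch of how the expand/pack helpers above compose, the snippet below averages two rows of sixteen 8-bit pixels through 16-bit intermediates and packs the result back with a rounding shift. The average_rows16 helper and its data are invented; only the v_* calls come from this header (or from the equivalent C fallback above).

// (row0[i] + row1[i] + 1) >> 1 for 16 pixels at a time
static void average_rows16(const uchar* row0, const uchar* row1, uchar* dst)
{
    cv::v_uint16x8 a0, a1, b0, b1;
    cv::v_expand(cv::v_load(row0), a0, a1);      // widen 16 x u8 into 2 x (8 x u16)
    cv::v_expand(cv::v_load(row1), b0, b1);
    cv::v_uint8x16 res = cv::v_rshr_pack<1>(a0 + b0, a1 + b1);  // rounding shift, saturate to u8
    cv::v_store(dst, res);
}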

File diff suppressed because it is too large


@ -0,0 +1,47 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
namespace cv { namespace hal {
}}

47
modules/hal/src/color.cpp Normal file

@ -0,0 +1,47 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
namespace cv { namespace hal {
}}


@ -0,0 +1,47 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
namespace cv { namespace hal {
}}

File diff suppressed because it is too large

208
modules/hal/src/matrix.cpp Normal file

@ -0,0 +1,208 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
namespace cv { namespace hal {
/****************************************************************************************\
* LU & Cholesky implementation for small matrices *
\****************************************************************************************/
template<typename _Tp> static inline int
LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
{
int i, j, k, p = 1;
astep /= sizeof(A[0]);
bstep /= sizeof(b[0]);
for( i = 0; i < m; i++ )
{
k = i;
for( j = i+1; j < m; j++ )
if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
k = j;
if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() )
return 0;
if( k != i )
{
for( j = i; j < m; j++ )
std::swap(A[i*astep + j], A[k*astep + j]);
if( b )
for( j = 0; j < n; j++ )
std::swap(b[i*bstep + j], b[k*bstep + j]);
p = -p;
}
_Tp d = -1/A[i*astep + i];
for( j = i+1; j < m; j++ )
{
_Tp alpha = A[j*astep + i]*d;
for( k = i+1; k < m; k++ )
A[j*astep + k] += alpha*A[i*astep + k];
if( b )
for( k = 0; k < n; k++ )
b[j*bstep + k] += alpha*b[i*bstep + k];
}
A[i*astep + i] = -d;
}
if( b )
{
for( i = m-1; i >= 0; i-- )
for( j = 0; j < n; j++ )
{
_Tp s = b[i*bstep + j];
for( k = i+1; k < m; k++ )
s -= A[i*astep + k]*b[k*bstep + j];
b[i*bstep + j] = s*A[i*astep + i];
}
}
return p;
}
int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return LUImpl(A, astep, m, b, bstep, n);
}
int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return LUImpl(A, astep, m, b, bstep, n);
}
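A self-contained usage sketch for the in-place LU solver above; the 3x3 system is made up for illustration, and the declarations are assumed to be exported via opencv2/hal.hpp as included by precomp.hpp. Note that astep/bstep are byte strides, both A and b are overwritten (b receives the solution), and the return value is 0 for a singular matrix.

#include "opencv2/hal.hpp"
#include <cstdio>

int main()
{
    float A[9] = { 4, 1, 2,
                   1, 3, 0,
                   2, 0, 5 };
    float b[3] = { 7, 4, 7 };                       // solution is x = (1, 1, 1)
    if( cv::hal::LU(A, 3*sizeof(float), 3, b, sizeof(float), 1) == 0 )
        std::printf("matrix is singular\n");
    else
        std::printf("x = %g %g %g\n", b[0], b[1], b[2]);
    return 0;
}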
template<typename _Tp> static inline bool
CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
{
_Tp* L = A;
int i, j, k;
double s;
astep /= sizeof(A[0]);
bstep /= sizeof(b[0]);
for( i = 0; i < m; i++ )
{
for( j = 0; j < i; j++ )
{
s = A[i*astep + j];
for( k = 0; k < j; k++ )
s -= L[i*astep + k]*L[j*astep + k];
L[i*astep + j] = (_Tp)(s*L[j*astep + j]);
}
s = A[i*astep + i];
for( k = 0; k < j; k++ )
{
double t = L[i*astep + k];
s -= t*t;
}
if( s < std::numeric_limits<_Tp>::epsilon() )
return false;
L[i*astep + i] = (_Tp)(1./std::sqrt(s));
}
if( !b )
return true;
// LLt x = b
// 1: L y = b
// 2. Lt x = y
/*
[ L00 ] y0 b0
[ L10 L11 ] y1 = b1
[ L20 L21 L22 ] y2 b2
[ L30 L31 L32 L33 ] y3 b3
[ L00 L10 L20 L30 ] x0 y0
[ L11 L21 L31 ] x1 = y1
[ L22 L32 ] x2 y2
[ L33 ] x3 y3
*/
for( i = 0; i < m; i++ )
{
for( j = 0; j < n; j++ )
{
s = b[i*bstep + j];
for( k = 0; k < i; k++ )
s -= L[i*astep + k]*b[k*bstep + j];
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
}
}
for( i = m-1; i >= 0; i-- )
{
for( j = 0; j < n; j++ )
{
s = b[i*bstep + j];
for( k = m-1; k > i; k-- )
s -= L[k*astep + i]*b[k*bstep + j];
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
}
}
return true;
}
bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
{
return CholImpl(A, astep, m, b, bstep, n);
}
bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
{
return CholImpl(A, astep, m, b, bstep, n);
}
}}
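And a matching hedged sketch for the Cholesky path, again with an invented symmetric positive definite system; as in CholImpl above, the factor is written into A with 1/sqrt stored on the diagonal, and b is replaced by the solution.

#include "opencv2/hal.hpp"
#include <cstdio>

int main()
{
    double A[4] = { 4, 1,
                    1, 3 };                         // symmetric positive definite
    double b[2] = { 5, 4 };                         // solution is x = (1, 1)
    if( cv::hal::Cholesky(A, 2*sizeof(double), 2, b, sizeof(double), 1) )
        std::printf("x = %g %g\n", b[0], b[1]);
    else
        std::printf("matrix is not positive definite\n");
    return 0;
}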


@ -0,0 +1,49 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/hal.hpp"
#include "opencv2/hal/intrin.hpp"
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <limits>
#include <float.h>


@ -0,0 +1,47 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
namespace cv { namespace hal {
}}

306
modules/hal/src/stat.cpp Normal file

@ -0,0 +1,306 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
namespace cv { namespace hal {
static const uchar popCountTable[] =
{
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
static const uchar popCountTable2[] =
{
0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
};
static const uchar popCountTable4[] =
{
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
int normHamming(const uchar* a, int n)
{
int i = 0;
int result = 0;
#if CV_NEON
{
uint32x4_t bits = vmovq_n_u32(0);
for (; i <= n - 16; i += 16) {
uint8x16_t A_vec = vld1q_u8 (a + i);
uint8x16_t bitsSet = vcntq_u8 (A_vec);
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
bits = vaddq_u32(bits, bitSet4);
}
uint64x2_t bitSet2 = vpaddlq_u32 (bits);
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
}
#endif
for( ; i <= n - 4; i += 4 )
result += popCountTable[a[i]] + popCountTable[a[i+1]] +
popCountTable[a[i+2]] + popCountTable[a[i+3]];
for( ; i < n; i++ )
result += popCountTable[a[i]];
return result;
}
int normHamming(const uchar* a, const uchar* b, int n)
{
int i = 0;
int result = 0;
#if CV_NEON
{
uint32x4_t bits = vmovq_n_u32(0);
for (; i <= n - 16; i += 16) {
uint8x16_t A_vec = vld1q_u8 (a + i);
uint8x16_t B_vec = vld1q_u8 (b + i);
uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
uint8x16_t bitsSet = vcntq_u8 (AxorB);
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
bits = vaddq_u32(bits, bitSet4);
}
uint64x2_t bitSet2 = vpaddlq_u32 (bits);
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
}
#endif
for( ; i <= n - 4; i += 4 )
result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
for( ; i < n; i++ )
result += popCountTable[a[i] ^ b[i]];
return result;
}
int normHamming(const uchar* a, int n, int cellSize)
{
if( cellSize == 1 )
return normHamming(a, n);
const uchar* tab = 0;
if( cellSize == 2 )
tab = popCountTable2;
else if( cellSize == 4 )
tab = popCountTable4;
else
return -1;
int i = 0;
int result = 0;
#if CV_ENABLE_UNROLLED
for( ; i <= n - 4; i += 4 )
result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
#endif
for( ; i < n; i++ )
result += tab[a[i]];
return result;
}
int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
{
if( cellSize == 1 )
return normHamming(a, b, n);
const uchar* tab = 0;
if( cellSize == 2 )
tab = popCountTable2;
else if( cellSize == 4 )
tab = popCountTable4;
else
return -1;
int i = 0;
int result = 0;
#if CV_ENABLE_UNROLLED
for( ; i <= n - 4; i += 4 )
result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
#endif
for( ; i < n; i++ )
result += tab[a[i] ^ b[i]];
return result;
}
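A quick hedged illustration of how these Hamming kernels are typically called when matching binary descriptors (e.g. BRISK/ORB); the 32-byte buffers are invented and the declarations are assumed to come from opencv2/hal.hpp.

#include "opencv2/hal.hpp"
#include <cstdio>
#include <cstring>

int main()
{
    unsigned char d1[32], d2[32];
    std::memset(d1, 0xFF, sizeof(d1));              // all 256 bits set
    std::memset(d2, 0x0F, sizeof(d2));              // half of the bits set
    std::printf("popcount(d1) = %d\n", cv::hal::normHamming(d1, 32));       // 256
    std::printf("dist(d1, d2) = %d\n", cv::hal::normHamming(d1, d2, 32));   // 128
    return 0;
}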
float normL2Sqr_(const float* a, const float* b, int n)
{
int j = 0; float d = 0.f;
#if CV_SSE
float CV_DECL_ALIGNED(16) buf[4];
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
for( ; j <= n - 8; j += 8 )
{
__m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
__m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
}
_mm_store_ps(buf, _mm_add_ps(d0, d1));
d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
{
for( ; j <= n - 4; j += 4 )
{
float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
}
}
for( ; j < n; j++ )
{
float t = a[j] - b[j];
d += t*t;
}
return d;
}
float normL1_(const float* a, const float* b, int n)
{
int j = 0; float d = 0.f;
#if CV_SSE
float CV_DECL_ALIGNED(16) buf[4];
static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
__m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
__m128 absmask = _mm_load_ps((const float*)absbuf);
for( ; j <= n - 8; j += 8 )
{
__m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
__m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
}
_mm_store_ps(buf, _mm_add_ps(d0, d1));
d = buf[0] + buf[1] + buf[2] + buf[3];
#elif CV_NEON
float32x4_t v_sum = vdupq_n_f32(0.0f);
for ( ; j <= n - 4; j += 4)
v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
float CV_DECL_ALIGNED(16) buf[4];
vst1q_f32(buf, v_sum);
d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
{
for( ; j <= n - 4; j += 4 )
{
d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
}
}
for( ; j < n; j++ )
d += std::abs(a[j] - b[j]);
return d;
}
int normL1_(const uchar* a, const uchar* b, int n)
{
int j = 0, d = 0;
#if CV_SSE
__m128i d0 = _mm_setzero_si128();
for( ; j <= n - 16; j += 16 )
{
__m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
__m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
}
for( ; j <= n - 4; j += 4 )
{
__m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
__m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
}
d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
#elif CV_NEON
uint32x4_t v_sum = vdupq_n_u32(0u);
for ( ; j <= n - 16; j += 16)
{
uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
}
uint CV_DECL_ALIGNED(16) buf[4];
vst1q_u32(buf, v_sum);
d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
{
for( ; j <= n - 4; j += 4 )
{
d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
}
}
for( ; j < n; j++ )
d += std::abs(a[j] - b[j]);
return d;
}
}} //cv::hal
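These kernels are normally reached through the public cv::norm API rather than called directly. A minimal sketch of the typical entry points, assuming cv::norm with NORM_HAMMING and NORM_L1 dispatches to the 8-bit routines above; the matrix sizes and random fill are placeholders:

#include <opencv2/core.hpp>

static void normUsageSketch()
{
    cv::Mat a(1, 32, CV_8U), b(1, 32, CV_8U);
    cv::randu(a, cv::Scalar::all(0), cv::Scalar::all(256));
    cv::randu(b, cv::Scalar::all(0), cv::Scalar::all(256));
    double hamming = cv::norm(a, b, cv::NORM_HAMMING); // popcount of XOR, see normHamming
    double sad = cv::norm(a, b, cv::NORM_L1);          // sum of absolute differences, see normL1_
    (void)hamming; (void)sad;
}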

47
modules/hal/src/warp.cpp Normal file
View File

@ -0,0 +1,47 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
namespace cv { namespace hal {
}}

View File

@ -117,7 +117,7 @@ CV_IMPL void cvAddText(const CvArr* img, const char* text, CvPoint org, CvFont*
"putText",
autoBlockingConnection(),
Q_ARG(void*, (void*) img),
Q_ARG(QString,QString(text)),
Q_ARG(QString,QString::fromUtf8(text)),
Q_ARG(QPoint, QPoint(org.x,org.y)),
Q_ARG(void*,(void*) font));
}
@ -418,12 +418,14 @@ static CvBar* icvFindBarByName(QBoxLayout* layout, QString name_bar, typeBar typ
static CvTrackbar* icvFindTrackBarByName(const char* name_trackbar, const char* name_window, QBoxLayout* layout = NULL)
{
QString nameQt(name_trackbar);
if ((!name_window || !name_window[0]) && global_control_panel) //window name is null and we have a control panel
QString nameWinQt(name_window);
if (nameWinQt.isEmpty() && global_control_panel) //window name is null and we have a control panel
layout = global_control_panel->myLayout;
if (!layout)
{
QPointer<CvWindow> w = icvFindWindowByName(QLatin1String(name_window));
QPointer<CvWindow> w = icvFindWindowByName(nameWinQt);
if (!w)
CV_Error(CV_StsNullPtr, "NULL window handler");
@ -1875,7 +1877,7 @@ bool CvWindow::isOpenGl()
void CvWindow::setViewportSize(QSize _size)
{
myView->getWidget()->resize(_size);
resize(_size);
myView->setSize(_size);
}

View File

@ -1,7 +1,3 @@
if(WINRT)
ocv_module_disable(imgcodecs)
endif()
set(the_description "Image codecs")
ocv_add_module(imgcodecs opencv_imgproc WRAP java python)

View File

@ -185,13 +185,14 @@ compression parameters :
void createAlphaMat(Mat &mat)
{
CV_Assert(mat.channels() == 4);
for (int i = 0; i < mat.rows; ++i) {
for (int j = 0; j < mat.cols; ++j) {
Vec4b& rgba = mat.at<Vec4b>(i, j);
rgba[0] = UCHAR_MAX;
rgba[1] = saturate_cast<uchar>((float (mat.cols - j)) / ((float)mat.cols) * UCHAR_MAX);
rgba[2] = saturate_cast<uchar>((float (mat.rows - i)) / ((float)mat.rows) * UCHAR_MAX);
rgba[3] = saturate_cast<uchar>(0.5 * (rgba[1] + rgba[2]));
Vec4b& bgra = mat.at<Vec4b>(i, j);
bgra[0] = UCHAR_MAX; // Blue
bgra[1] = saturate_cast<uchar>((float (mat.cols - j)) / ((float)mat.cols) * UCHAR_MAX); // Green
bgra[2] = saturate_cast<uchar>((float (mat.rows - i)) / ((float)mat.rows) * UCHAR_MAX); // Red
bgra[3] = saturate_cast<uchar>(0.5 * (bgra[1] + bgra[2])); // Alpha
}
}
}
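The corrected sample fills a BGRA matrix; OpenCV's channel order is blue, green, red, alpha, so the earlier rgba naming was misleading even though the indices and values were unchanged. For context, a hedged sketch of how such a matrix is typically written out as a PNG with alpha; the parameter vector, compression level and file name below are illustrative, not taken from this patch:

#include <opencv2/imgcodecs.hpp>
#include <vector>

int main()
{
    cv::Mat mat(480, 640, CV_8UC4);
    createAlphaMat(mat);                                // fill B, G, R and alpha channels
    std::vector<int> params;
    params.push_back(cv::IMWRITE_PNG_COMPRESSION);
    params.push_back(9);                                // 0-9; higher = smaller file, slower encode
    cv::imwrite("alpha.png", mat, params);              // needs a build with PNG support
    return 0;
}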

View File

@ -45,6 +45,7 @@
#ifdef HAVE_JASPER
#include "grfmt_jpeg2000.hpp"
#include "opencv2/imgproc.hpp"
#ifdef WIN32
#define JAS_WIN_MSVC_BUILD 1
@ -159,6 +160,21 @@ bool Jpeg2KDecoder::readData( Mat& img )
jas_stream_t* stream = (jas_stream_t*)m_stream;
jas_image_t* image = (jas_image_t*)m_image;
#ifndef WIN32
// At least on some Linux instances the
// system libjasper segfaults when
// converting color to grey.
// We do this conversion manually at the end.
Mat clr;
if (CV_MAT_CN(img.type()) < CV_MAT_CN(this->type()))
{
clr.create(img.size().height, img.size().width, this->type());
color = true;
data = clr.ptr();
step = (int)clr.step;
}
#endif
if( stream && image )
{
bool convert;
@ -171,7 +187,7 @@ bool Jpeg2KDecoder::readData( Mat& img )
else
{
convert = (jas_clrspc_fam( jas_image_clrspc( image ) ) != JAS_CLRSPC_FAM_GRAY);
colorspace = JAS_CLRSPC_SGRAY; // TODO GENGRAY or SGRAY?
colorspace = JAS_CLRSPC_SGRAY; // TODO GENGRAY or SGRAY? (GENGRAY fails on Win.)
}
// convert to the desired colorspace
@ -256,6 +272,13 @@ bool Jpeg2KDecoder::readData( Mat& img )
close();
#ifndef WIN32
if (!clr.empty())
{
cv::cvtColor(clr, img, COLOR_BGR2GRAY);
}
#endif
return result;
}
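The added #ifndef WIN32 block decodes into a temporary colour matrix and lets OpenCV itself convert to grey, working around the libjasper segfaults seen on some Linux systems. For the caller nothing changes; a grayscale JPEG-2000 load is still a plain imread. A minimal sketch, with the file name echoing the Jasper test data referenced further down:

#include <opencv2/imgcodecs.hpp>

static cv::Mat loadJp2Grey()
{
    cv::Mat grey = cv::imread("Bretagne2.jp2", cv::IMREAD_GRAYSCALE);
    CV_Assert(!grey.empty() && grey.type() == CV_8UC1);
    return grey;
}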

View File

@ -374,15 +374,8 @@ imreadmulti_(const String& filename, int flags, std::vector<Mat>& mats)
type = CV_MAKETYPE(CV_MAT_DEPTH(type), 1);
}
// established the required input image size.
CvSize size;
size.width = decoder->width();
size.height = decoder->height();
Mat mat;
mat.create(size.height, size.width, type);
// read the image data
Mat mat(decoder->height(), decoder->width(), type);
if (!decoder->readData(mat))
{
break;

View File

@ -448,3 +448,81 @@ protected:
};
TEST(Imgcodecs_Drawing, fillconvexpoly_clipping) { CV_FillConvexPolyTest test; test.safe_run(); }
class CV_DrawingTest_UTF8 : public cvtest::BaseTest
{
public:
CV_DrawingTest_UTF8() {}
~CV_DrawingTest_UTF8() {}
protected:
void run(int)
{
vector<string> lines;
lines.push_back("abcdefghijklmnopqrstuvwxyz1234567890");
// cyrillic letters small
lines.push_back("\xD0\xB0\xD0\xB1\xD0\xB2\xD0\xB3\xD0\xB4\xD0\xB5\xD1\x91\xD0\xB6\xD0\xB7"
"\xD0\xB8\xD0\xB9\xD0\xBA\xD0\xBB\xD0\xBC\xD0\xBD\xD0\xBE\xD0\xBF\xD1\x80"
"\xD1\x81\xD1\x82\xD1\x83\xD1\x84\xD1\x85\xD1\x86\xD1\x87\xD1\x88\xD1\x89"
"\xD1\x8A\xD1\x8B\xD1\x8C\xD1\x8D\xD1\x8E\xD1\x8F");
// cyrillic letters capital
lines.push_back("\xD0\x90\xD0\x91\xD0\x92\xD0\x93\xD0\x94\xD0\x95\xD0\x81\xD0\x96\xD0\x97"
"\xD0\x98\xD0\x99\xD0\x9A\xD0\x9B\xD0\x9C\xD0\x9D\xD0\x9E\xD0\x9F\xD0\xA0"
"\xD0\xA1\xD0\xA2\xD0\xA3\xD0\xA4\xD0\xA5\xD0\xA6\xD0\xA7\xD0\xA8\xD0\xA9"
"\xD0\xAA\xD0\xAB\xD0\xAC\xD0\xAD\xD0\xAE\xD0\xAF");
// bounds
lines.push_back("-\xD0\x80-\xD0\x8E-\xD0\x8F-");
lines.push_back("-\xD1\x90-\xD1\x91-\xD1\xBF-");
// bad utf8
lines.push_back("-\x81-\x82-\x83-");
lines.push_back("--\xF0--");
lines.push_back("-\xF0");
vector<int> fonts;
fonts.push_back(FONT_HERSHEY_SIMPLEX);
fonts.push_back(FONT_HERSHEY_PLAIN);
fonts.push_back(FONT_HERSHEY_DUPLEX);
fonts.push_back(FONT_HERSHEY_COMPLEX);
fonts.push_back(FONT_HERSHEY_TRIPLEX);
fonts.push_back(FONT_HERSHEY_COMPLEX_SMALL);
fonts.push_back(FONT_HERSHEY_SCRIPT_SIMPLEX);
fonts.push_back(FONT_HERSHEY_SCRIPT_COMPLEX);
vector<Mat> results;
Size bigSize(0, 0);
for (vector<int>::const_iterator font = fonts.begin(); font != fonts.end(); ++font)
{
for (int italic = 0; italic <= FONT_ITALIC; italic += FONT_ITALIC)
{
for (vector<string>::const_iterator line = lines.begin(); line != lines.end(); ++line)
{
const float fontScale = 1;
const int thickness = 1;
const Scalar color(20,20,20);
int baseline = 0;
Size textSize = getTextSize(*line, *font | italic, fontScale, thickness, &baseline);
Point textOrg(0, textSize.height + 2);
Mat img(textSize + Size(0, baseline), CV_8UC3, Scalar(255, 255, 255));
putText(img, *line, textOrg, *font | italic, fontScale, color, thickness, CV_AA);
results.push_back(img);
bigSize.width = max(bigSize.width, img.size().width);
bigSize.height += img.size().height + 1;
}
}
}
int shift = 0;
Mat result(bigSize, CV_8UC3, Scalar(100, 100, 100));
for (vector<Mat>::const_iterator img = results.begin(); img != results.end(); ++img)
{
Rect roi(Point(0, shift), img->size());
Mat sub(result, roi);
img->copyTo(sub);
shift += img->size().height + 1;
}
imwrite("/tmp/all_fonts.png", result);
}
};
TEST(Highgui_Drawing, utf8_support) { CV_DrawingTest_UTF8 test; test.safe_run(); }
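The new test renders UTF-8 strings, including deliberately malformed sequences, with every Hershey font and only verifies that drawing completes; glyphs outside the Hershey repertoire are expected to degrade gracefully rather than crash. A reduced sketch of the pattern being exercised, with the string, canvas size and colours chosen purely for illustration:

#include <opencv2/imgproc.hpp>
#include <string>

static void drawUtf8Sample()
{
    std::string line = "\xD0\xB0\xD0\xB1\xD0\xB2";            // UTF-8 encoded Cyrillic letters
    cv::Mat img(64, 320, CV_8UC3, cv::Scalar(255, 255, 255)); // white canvas
    cv::putText(img, line, cv::Point(10, 40), cv::FONT_HERSHEY_SIMPLEX,
                1.0, cv::Scalar(20, 20, 20), 1, cv::LINE_AA);
}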

View File

@ -87,6 +87,9 @@ TEST(Imgcodecs_imread, regression)
{
const char* const filenames[] =
{
#ifdef HAVE_JASPER
"Rome.jp2",
#endif
"color_palette_alpha.png",
"multipage.tif",
"rle.hdr",
@ -99,16 +102,32 @@ TEST(Imgcodecs_imread, regression)
for (size_t i = 0; i < sizeof(filenames) / sizeof(filenames[0]); ++i)
{
ASSERT_TRUE(imread_compare(folder + string(filenames[i]), IMREAD_UNCHANGED));
ASSERT_TRUE(imread_compare(folder + string(filenames[i]), IMREAD_GRAYSCALE));
ASSERT_TRUE(imread_compare(folder + string(filenames[i]), IMREAD_COLOR));
ASSERT_TRUE(imread_compare(folder + string(filenames[i]), IMREAD_ANYDEPTH));
ASSERT_TRUE(imread_compare(folder + string(filenames[i]), IMREAD_ANYCOLOR));
if (i != 2) // GDAL does not support hdr
ASSERT_TRUE(imread_compare(folder + string(filenames[i]), IMREAD_LOAD_GDAL));
const string path = folder + string(filenames[i]);
ASSERT_TRUE(imread_compare(path, IMREAD_UNCHANGED));
ASSERT_TRUE(imread_compare(path, IMREAD_GRAYSCALE));
ASSERT_TRUE(imread_compare(path, IMREAD_COLOR));
ASSERT_TRUE(imread_compare(path, IMREAD_ANYDEPTH));
ASSERT_TRUE(imread_compare(path, IMREAD_ANYCOLOR));
if (path.substr(path.length() - 3) != "hdr")
{
// GDAL does not support hdr
ASSERT_TRUE(imread_compare(path, IMREAD_LOAD_GDAL));
}
}
}
#ifdef HAVE_JASPER
TEST(Imgcodecs_jasper, regression)
{
const string folder = string(cvtest::TS::ptr()->get_data_path()) + "/readwrite/";
ASSERT_TRUE(imread_compare(folder + "Bretagne2.jp2", IMREAD_COLOR));
ASSERT_TRUE(imread_compare(folder + "Bretagne2.jp2", IMREAD_GRAYSCALE));
ASSERT_TRUE(imread_compare(folder + "Grey.jp2", IMREAD_COLOR));
ASSERT_TRUE(imread_compare(folder + "Grey.jp2", IMREAD_GRAYSCALE));
}
#endif
class CV_GrfmtWriteBigImageTest : public cvtest::BaseTest
{
public:

View File

@ -3494,7 +3494,7 @@ CV_EXPORTS_W double contourArea( InputArray contour, bool oriented = false );
The function calculates and returns the minimum-area bounding rectangle (possibly rotated) for a
specified point set. See the OpenCV sample minarea.cpp . Developer should keep in mind that the
returned rotatedRect can contain negative indices when data is close the the containing Mat element
returned rotatedRect can contain negative indices when data is close to the containing Mat element
boundary.
@param points Input vector of 2D points, stored in std::vector\<\> or Mat

Some files were not shown because too many files have changed in this diff.