Merge branch 'master' into better_png_transparency

* master: (468 commits) added suppression for TBB valgrind issue update CUDA architecture flags initialization increase minimal supported CUDA toolkit to 6.5 check the CPU flag correctly opencv_visualization: check cmdline args provide better error messages stop search of markers in Exif reader to prevent infinite loop Fix calibration fail on python with CALIB_THIN_PRISM_MODEL flag clarify CUDA arithm operations usage with mask fixed empty image condition in resize fixed memory leak in flann tests fisheye: add CALIB_FIX_PRINCIPAL_POINT get/put: more type-safety and code unification using templates py_tutorials: fix cv2.findContours return val imgproc: speed up threshold of 64F version using NEON and SSE * use NEON under aarch64 only * check 64F version correctly bigdata: add test, resolve split/merge issue Improved Carotene library linear resize evaluation precision and enabled it as HAL implementation. persistence: fixing crash with space-only values Removed unnecessary check for Android API level and unused flags. Fix for median blur of 2-channel images ...
2016-07-14 14:05:16 +02:00 · 2016-07-14 14:05:16 +02:00 · d40e46bc9b
commit d40e46bc9b
parent bdb9cf4d47 bb41df9d5f
597 changed files with 62455 additions and 8567 deletions
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@ -0,0 +1,30 @@
+<!--
+If you have a question rather than a bug report, please go to http://answers.opencv.org, where you will get a much faster response.
+If you need further assistance please read [How To Contribute](https://github.com/Itseez/opencv/wiki/How_to_contribute).
+
+This is a template helping you to create an issue which can be processed as quickly as possible. This is the bug reporting section for the OpenCV library.
+-->
+
+##### System information (version)
+<!-- Example
+- OpenCV => 3.1
+- Operating System / Platform => Windows 64 Bit
+- Compiler => Visual Studio 2015
+-->
+
+- OpenCV => :grey_question:
+- Operating System / Platform => :grey_question:
+- Compiler => :grey_question:
+
+##### Detailed description
+
+<!-- your description -->
+
+##### Steps to reproduce
+
+<!-- To add a code example, fence it with triple backticks and an optional file extension:
+    ```.cpp
+    // C++ code example
+    ```
+ or attach it as a .txt or .zip file.
+-->
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -0,0 +1,9 @@
+<!-- Please use this line to close one or more issues when this pull request gets merged.
+You can add another line right under the first one:
+resolves #1234
+resolves #1235
+-->
+
+### This pull request changes
+
+<!-- Please describe what your pull request changes -->
--- a/3rdparty/carotene/.gitignore
+++ b/3rdparty/carotene/.gitignore
@ -0,0 +1,8 @@
+# Gedit temp files
+*~
+
+# Qt Creator file
+*.user
+
+# MacOS-specific (Desktop Services Store)
+.DS_Store
--- a/3rdparty/carotene/CMakeLists.txt
+++ b/3rdparty/carotene/CMakeLists.txt
@ -0,0 +1,42 @@
+# Build script for the Carotene library: globs the headers under include/ and
+# the sources under src/, compiles them into the object library `carotene_objs`
+# and wraps those objects in the static library `carotene` (EXCLUDE_FROM_ALL).
+cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
+
+project(Carotene)
+
+# C++ namespace used for Carotene symbols. Any value other than the default is
+# forwarded to the compiler as CAROTENE_NS=<value> further down.
+set(CAROTENE_NS "carotene" CACHE STRING "Namespace for Carotene definitions")
+
+set(CAROTENE_INCLUDE_DIR include)
+set(CAROTENE_SOURCE_DIR src)
+
+# File lists are kept relative to this directory.
+file(GLOB_RECURSE carotene_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_INCLUDE_DIR}/*.hpp")
+file(GLOB_RECURSE carotene_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_SOURCE_DIR}/*.cpp"
+                                                                        "${CAROTENE_SOURCE_DIR}/*.hpp")
+
+include_directories(${CAROTENE_INCLUDE_DIR})
+
+# Everything in this branch is a C++ flag applied to C++ sources, so test the
+# C++ compiler (not the C compiler) for GCC compatibility.
+if(CMAKE_COMPILER_IS_GNUCXX)
+    set(CMAKE_CXX_FLAGS "-fvisibility=hidden ${CMAKE_CXX_FLAGS}")
+
+    # allow more inlines - these parameters improve performance for:
+    # - matchTemplate about 5-10%
+    # - goodFeaturesToTrack 10-20%
+    # - cornerHarris 30% for some cases
+
+    set_source_files_properties(${carotene_sources} COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
+endif()
+
+# Object library, so that a parent project can reuse the compiled objects via
+# $<TARGET_OBJECTS:carotene_objs>.
+add_library(carotene_objs OBJECT
+  ${carotene_headers}
+  ${carotene_sources}
+)
+
+# Only a non-default namespace needs to be defined explicitly; it is PUBLIC so
+# it also appears in the target's INTERFACE_COMPILE_DEFINITIONS.
+if(NOT CAROTENE_NS STREQUAL "carotene")
+    target_compile_definitions(carotene_objs PUBLIC "-DCAROTENE_NS=${CAROTENE_NS}")
+endif()
+
+# WITH_NEON is not set in this file; it is expected to come from the including
+# project or the command line.
+if(WITH_NEON)
+    target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
+endif()
+
+set_target_properties(carotene_objs PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
+
+add_library(carotene STATIC EXCLUDE_FROM_ALL "$<TARGET_OBJECTS:carotene_objs>")
--- a/3rdparty/carotene/README.md
+++ b/3rdparty/carotene/README.md
@ -0,0 +1,2 @@
+This is Carotene, a low-level library containing optimized CPU routines
+that are useful for computer vision algorithms.
--- a/3rdparty/carotene/hal/CMakeLists.txt
+++ b/3rdparty/carotene/hal/CMakeLists.txt
@ -0,0 +1,112 @@
+# This script calls target_include_directories() for tegra_hal and pulls in the
+# Carotene subproject, both of which need CMake 2.8.11; declare that up front
+# so older CMake fails with a clear version message.
+cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
+
+include(CheckCCompilerFlag)
+include(CheckCXXCompilerFlag)
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+# Directory of this HAL wrapper and of the Carotene sources one level above it.
+set(TEGRA_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
+set(CAROTENE_DIR "${TEGRA_HAL_DIR}/../")
+
+# Classify the target CPU from CMAKE_SYSTEM_PROCESSOR; at most one of ARM /
+# AARCH64 is set here.
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
+  set(ARM TRUE)
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64.*|AARCH64.*")
+  set(AARCH64 TRUE)
+endif()
+
+# Extra GCC switches for the HAL. They are gathered as a list, flattened to a
+# space-separated string and appended to both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS.
+set(TEGRA_COMPILER_FLAGS "")
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+  # Generate unwind information even for functions that can't throw/propagate exceptions.
+  # This lets debuggers and such get non-broken backtraces for such functions, even without debugging symbols.
+  list(APPEND TEGRA_COMPILER_FLAGS -funwind-tables)
+
+  # Scheduling / register-renaming switches. The -fsched2-* pair is left out on
+  # X86, ARMEABI_V6 and on MIPS with an Android compiler older than 4.6.
+  list(APPEND TEGRA_COMPILER_FLAGS -fweb -fwrapv -frename-registers)
+  if(NOT (X86 OR ARMEABI_V6 OR (MIPS AND ANDROID_COMPILER_VERSION VERSION_LESS "4.6")))
+    list(APPEND TEGRA_COMPILER_FLAGS -fsched2-use-superblocks -fsched2-use-traces)
+  endif()
+  list(APPEND TEGRA_COMPILER_FLAGS -fsched-stalled-insns-dep=100 -fsched-stalled-insns=2)
+
+  # Graphite loop optimisations, gated on the Android toolchain variables.
+  if((ANDROID_COMPILER_IS_CLANG OR NOT ANDROID_COMPILER_VERSION VERSION_LESS "4.7") AND ANDROID_NDK_RELEASE STRGREATER "r8d" )
+    list(APPEND TEGRA_COMPILER_FLAGS -fgraphite -fgraphite-identity -floop-block -floop-flatten -floop-interchange
+                                     -floop-strip-mine -floop-parallelize-all -ftree-loop-linear)
+  endif()
+endif()
+
+# Turn the list into a plain flag string before appending it.
+string(REPLACE ";" " " TEGRA_COMPILER_FLAGS "${TEGRA_COMPILER_FLAGS}")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${TEGRA_COMPILER_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TEGRA_COMPILER_FLAGS}")
+
+# On ARMEABI_V7A with GCC, switch the compiler's tree vectorizer off.
+if(ARMEABI_V7A AND CMAKE_COMPILER_IS_GNUCXX)
+  set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-tree-vectorize" )
+  set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-tree-vectorize" )
+endif()
+
+if(WITH_LOGS)
+  add_definitions(-DHAVE_LOGS)
+endif()
+
+# The HAL build always uses its own namespace; FORCE overwrites any cached value.
+set(CAROTENE_NS "carotene_o4t" CACHE STRING "" FORCE)
+
+# compile_carotene()
+#
+# Adds the Carotene sources in CAROTENE_DIR as a subdirectory (which defines
+# the `carotene_objs` object library) and, on ARM/AARCH64, appends -mfpu=neon
+# to that target's COMPILE_FLAGS when both the C and the C++ compiler accept it.
+#
+# Takes no arguments. Reads ENABLE_NEON, ARM, AARCH64, CMAKE_BUILD_TYPE and
+# CAROTENE_DIR from the calling scope. Being a function, the plain variables it
+# sets (WITH_NEON, CMAKE_TRY_COMPILE_CONFIGURATION, old_flags) stay local to it
+# and to the subdirectory it adds.
+function(compile_carotene)
+  # Carotene's own script looks at WITH_NEON, so translate ENABLE_NEON for it.
+  if(ENABLE_NEON)
+    set(WITH_NEON ON)
+  endif()
+
+  add_subdirectory("${CAROTENE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/carotene")
+
+  if(ARM OR AARCH64)
+    if(CMAKE_BUILD_TYPE)
+      set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE})
+    endif()
+    check_cxx_compiler_flag("-mfpu=neon" CXX_HAS_MFPU_NEON)
+    check_c_compiler_flag("-mfpu=neon" C_HAS_MFPU_NEON)
+    # Pass the variable names, not ${...}: a failed check stores an empty
+    # string, and expanding it would leave a malformed condition such as
+    # `if(1 AND)` that aborts configuration.
+    if(CXX_HAS_MFPU_NEON AND C_HAS_MFPU_NEON)
+      # get_target_property yields a *-NOTFOUND value (false in if()) when the
+      # property has not been set yet.
+      get_target_property(old_flags "carotene_objs" COMPILE_FLAGS)
+      if(old_flags)
+        set_target_properties("carotene_objs" PROPERTIES COMPILE_FLAGS "${old_flags} -mfpu=neon")
+      else()
+        set_target_properties("carotene_objs" PROPERTIES COMPILE_FLAGS "-mfpu=neon")
+      endif()
+    endif()
+  endif()
+endfunction()
+
+compile_carotene()
+
+include_directories("${CAROTENE_DIR}/include")
+
+# Apply carotene_objs' public compile definitions to this directory as well.
+# get_target_property returns carotene_defs-NOTFOUND when the property is
+# unset; skip the append in that case so no bogus definition is added.
+get_target_property(carotene_defs carotene_objs INTERFACE_COMPILE_DEFINITIONS)
+if(carotene_defs)
+  set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS ${carotene_defs})
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+  # allow more inlines - these parameters improve performance for:
+  #   matchTemplate about 5-10%
+  #   goodFeaturesToTrack 10-20%
+  #   cornerHarris 30% for some cases
+  # NOTE(review): the generator expression is used here as a source file name;
+  # confirm that set_source_files_properties actually applies these flags to
+  # the Carotene objects (the Carotene script sets the same flags on its sources).
+  set_source_files_properties(impl.cpp $<TARGET_OBJECTS:carotene_objs> COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
+endif()
+
+# tegra_hal: static library assembled directly from Carotene's object files.
+add_library(tegra_hal STATIC $<TARGET_OBJECTS:carotene_objs>)
+set_target_properties(tegra_hal PROPERTIES
+  POSITION_INDEPENDENT_CODE TRUE
+  ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH}
+)
+
+set(OPENCV_SRC_DIR "${CMAKE_SOURCE_DIR}")
+
+# The archive is only installed for static OpenCV builds.
+if(NOT BUILD_SHARED_LIBS)
+  ocv_install_target(tegra_hal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
+endif()
+
+target_include_directories(tegra_hal
+  PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${OPENCV_SRC_DIR}/modules/core/include
+)
+
+# Copy the HAL header and the public Carotene headers into
+# <build>/carotene so they can be included as "carotene/<name>.hpp".
+configure_file("${CAROTENE_DIR}/include/carotene/definitions.hpp" "${CMAKE_BINARY_DIR}/carotene/definitions.hpp" COPYONLY)
+configure_file("${CAROTENE_DIR}/include/carotene/functions.hpp" "${CMAKE_BINARY_DIR}/carotene/functions.hpp" COPYONLY)
+configure_file("${CAROTENE_DIR}/include/carotene/types.hpp" "${CMAKE_BINARY_DIR}/carotene/types.hpp" COPYONLY)
+configure_file("tegra_hal.hpp" "${CMAKE_BINARY_DIR}/carotene/tegra_hal.hpp" COPYONLY)
+
+# Results handed back to the directory that added this one.
+set(CAROTENE_HAL_VERSION "0.0.1" PARENT_SCOPE)
+set(CAROTENE_HAL_LIBRARIES "tegra_hal" PARENT_SCOPE)
+set(CAROTENE_HAL_HEADERS "carotene/tegra_hal.hpp" PARENT_SCOPE)
+set(CAROTENE_HAL_INCLUDE_DIRS "${CMAKE_BINARY_DIR}" PARENT_SCOPE)
--- a/3rdparty/carotene/hal/tegra_hal.hpp
+++ b/3rdparty/carotene/hal/tegra_hal.hpp
--- a/3rdparty/carotene/include/carotene/definitions.hpp
+++ b/3rdparty/carotene/include/carotene/definitions.hpp
@ -0,0 +1,47 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+// Include guard for the Carotene configuration macros.
+#ifndef CAROTENE_DEFINITIONS_HPP
+#define CAROTENE_DEFINITIONS_HPP
+
+// CAROTENE_NS names the C++ namespace that Carotene code is declared in.
+// A build may predefine it (for example with -DCAROTENE_NS=<name>); when it is
+// not predefined the namespace defaults to `carotene`.
+#ifndef CAROTENE_NS
+#define CAROTENE_NS carotene
+#endif
+
+#endif
--- a/3rdparty/carotene/include/carotene/functions.hpp
+++ b/3rdparty/carotene/include/carotene/functions.hpp
--- a/3rdparty/carotene/include/carotene/types.hpp
+++ b/3rdparty/carotene/include/carotene/types.hpp
@ -0,0 +1,125 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#ifndef CAROTENE_TYPES_HPP
+#define CAROTENE_TYPES_HPP
+
+#include <carotene/definitions.hpp>
+#include <stdint.h>
+#include <cstddef>
+
+#ifndef UINT32_MAX
+    #define UINT32_MAX (4294967295U)
+#endif
+
+// Basic scalar aliases, option enums and small value types shared by the
+// Carotene API.
+namespace CAROTENE_NS {
+    using std::size_t;
+    using std::ptrdiff_t;
+
+    // Short fixed-width aliases: s = signed, u = unsigned, f = floating point,
+    // followed by the width in bits.
+    typedef int8_t   s8;
+    typedef uint8_t  u8;
+    typedef int16_t  s16;
+    typedef uint16_t u16;
+    typedef int32_t  s32;
+    typedef uint32_t u32;
+    typedef float    f32;
+    typedef int64_t  s64;
+    typedef uint64_t u64;
+    typedef double   f64;
+
+    // Signed type used for row strides.
+    typedef ptrdiff_t  stride_t;
+
+    // Selects wrapping or saturating conversion.
+    enum CONVERT_POLICY
+    {
+        CONVERT_POLICY_WRAP,
+        CONVERT_POLICY_SATURATE
+    };
+
+    // Border handling modes.
+    enum BORDER_MODE
+    {
+        BORDER_MODE_UNDEFINED,
+        BORDER_MODE_CONSTANT,
+        BORDER_MODE_REPLICATE,
+        BORDER_MODE_REFLECT,
+        BORDER_MODE_REFLECT101,
+        BORDER_MODE_WRAP
+    };
+
+    // Flip directions; the values are bit flags, and FLIP_BOTH_MODE is the
+    // bitwise OR of the horizontal and vertical flags.
+    enum FLIP_MODE
+    {
+        FLIP_HORIZONTAL_MODE = 1,
+        FLIP_VERTICAL_MODE = 2,
+        FLIP_BOTH_MODE = FLIP_HORIZONTAL_MODE | FLIP_VERTICAL_MODE
+    };
+
+    // Colour-space standards (BT.601 / BT.709).
+    enum COLOR_SPACE
+    {
+        COLOR_SPACE_BT601,
+        COLOR_SPACE_BT709
+    };
+
+    // Width/height pair; default-constructed as 0 x 0.
+    struct Size2D {
+        Size2D() : width(0), height(0) {}
+        Size2D(size_t width_, size_t height_) : width(width_), height(height_) {}
+
+        size_t width;
+        size_t height;
+
+        // Returns width * height (plain size_t multiplication, no overflow check).
+        inline size_t total() const
+        {
+            return width * height;
+        }
+    };
+
+    // Four-sided margin; default-constructed with all sides 0.
+    struct Margin {
+        Margin() : left(0), right(0), top(0), bottom(0) {}
+        Margin(size_t left_, size_t right_, size_t top_, size_t bottom_)
+            : left(left_), right(right_), top(top_), bottom(bottom_) {}
+
+        // these are measured in elements
+        size_t left, right, top, bottom;
+    };
+
+    // Abstract sink for keypoints: implementers override push(), which takes
+    // position and size plus optional angle, response, octave and class id
+    // (defaults -1, 0, 0 and -1 respectively).
+    struct KeypointStore {
+        virtual void push(f32 kpX, f32 kpY, f32 kpSize, f32 kpAngle=-1, f32 kpResponse=0, s32 kpOctave=0, s32 kpClass_id=-1) = 0;
+        virtual ~KeypointStore() {}
+    };
+}
+
+#endif
--- a/3rdparty/carotene/src/absdiff.cpp
+++ b/3rdparty/carotene/src/absdiff.cpp
@ -0,0 +1,241 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include <algorithm>
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+// Functor computing the absolute difference of two operands. It offers three
+// call operators: one for 128-bit vectors, one for 64-bit vectors and one for
+// a single scalar element.
+template <typename T>
+struct AbsDiff
+{
+    typedef T type;
+    typedef typename internal::VecTraits<T>::vec128 vec128;
+    typedef typename internal::VecTraits<T>::vec64  vec64;
+
+    // 128-bit vector form, forwarded to internal::vabdq.
+    void operator() (const vec128 & v_a, const vec128 & v_b, vec128 & v_out) const
+    {
+        v_out = internal::vabdq(v_a, v_b);
+    }
+
+    // 64-bit vector form, forwarded to internal::vabd.
+    void operator() (const vec64 & v_a, const vec64 & v_b, vec64 & v_out) const
+    {
+        v_out = internal::vabd(v_a, v_b);
+    }
+
+    // Scalar form: subtracts the smaller element from the larger one.
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        if (src0[0] >= src1[0])
+            dst[0] = src0[0] - src1[0];
+        else
+            dst[0] = src1[0] - src0[0];
+    }
+};
+
+// Absolute-difference functor for signed element types. The difference is
+// formed as max - min with a saturating subtraction in the vector forms and
+// through a 64-bit intermediate plus saturate_cast in the scalar form.
+template <typename T>
+struct AbsDiffSigned
+{
+    typedef T type;
+    typedef typename internal::VecTraits<T>::vec128 vec128;
+    typedef typename internal::VecTraits<T>::vec64  vec64;
+
+    // 128-bit vector form.
+    void operator() (const vec128 & v_a, const vec128 & v_b, vec128 & v_out) const
+    {
+        vec128 v_lo = internal::vminq(v_a, v_b);
+        vec128 v_hi = internal::vmaxq(v_a, v_b);
+        v_out = internal::vqsubq(v_hi, v_lo);
+    }
+
+    // 64-bit vector form.
+    void operator() (const vec64 & v_a, const vec64 & v_b, vec64 & v_out) const
+    {
+        vec64 v_lo = internal::vmin(v_a, v_b);
+        vec64 v_hi = internal::vmax(v_a, v_b);
+        v_out = internal::vqsub(v_hi, v_lo);
+    }
+
+    // Scalar form: widen to s64 so the subtraction cannot overflow, then
+    // saturate back to T.
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        s64 diff;
+        if (src0[0] >= src1[0])
+            diff = (s64)src0[0] - src1[0];
+        else
+            diff = (s64)src1[0] - src0[0];
+        dst[0] = internal::saturate_cast<T>(diff);
+    }
+};
+
+} // namespace
+
+#endif
+
+// absDiff, u8 overload: hands both inputs and the destination to
+// internal::vtransform together with an AbsDiff<u8> functor.
+// In builds without CAROTENE_NEON only the configuration check is executed and
+// the arguments are left untouched.
+void absDiff(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             u8 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    AbsDiff<u8> op;
+    internal::vtransform(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, op);
+#else
+    // Silence unused-parameter warnings.
+    (void)size;
+    (void)src0Base; (void)src0Stride;
+    (void)src1Base; (void)src1Stride;
+    (void)dstBase;  (void)dstStride;
+#endif
+}
+
+// absDiff, u16 overload: hands both inputs and the destination to
+// internal::vtransform together with an AbsDiff<u16> functor.
+// In builds without CAROTENE_NEON only the configuration check is executed and
+// the arguments are left untouched.
+void absDiff(const Size2D &size,
+             const u16 *src0Base, ptrdiff_t src0Stride,
+             const u16 *src1Base, ptrdiff_t src1Stride,
+             u16 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    AbsDiff<u16> op;
+    internal::vtransform(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, op);
+#else
+    // Silence unused-parameter warnings.
+    (void)size;
+    (void)src0Base; (void)src0Stride;
+    (void)src1Base; (void)src1Stride;
+    (void)dstBase;  (void)dstStride;
+#endif
+}
+
+// absDiff, s8 overload: hands both inputs and the destination to
+// internal::vtransform together with an AbsDiffSigned<s8> functor, whose
+// subtraction saturates.
+// In builds without CAROTENE_NEON only the configuration check is executed and
+// the arguments are left untouched.
+void absDiff(const Size2D &size,
+             const s8 *src0Base, ptrdiff_t src0Stride,
+             const s8 *src1Base, ptrdiff_t src1Stride,
+             s8 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    AbsDiffSigned<s8> op;
+    internal::vtransform(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, op);
+#else
+    // Silence unused-parameter warnings.
+    (void)size;
+    (void)src0Base; (void)src0Stride;
+    (void)src1Base; (void)src1Stride;
+    (void)dstBase;  (void)dstStride;
+#endif
+}
+
+// absDiff, s16 overload: hands both inputs and the destination to
+// internal::vtransform together with an AbsDiffSigned<s16> functor, whose
+// subtraction saturates.
+// In builds without CAROTENE_NEON only the configuration check is executed and
+// the arguments are left untouched.
+void absDiff(const Size2D &size,
+             const s16 *src0Base, ptrdiff_t src0Stride,
+             const s16 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    AbsDiffSigned<s16> op;
+    internal::vtransform(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, op);
+#else
+    // Silence unused-parameter warnings.
+    (void)size;
+    (void)src0Base; (void)src0Stride;
+    (void)src1Base; (void)src1Stride;
+    (void)dstBase;  (void)dstStride;
+#endif
+}
+
+// absDiff, s32 overload: hands both inputs and the destination to
+// internal::vtransform together with an AbsDiffSigned<s32> functor, whose
+// subtraction saturates.
+// In builds without CAROTENE_NEON only the configuration check is executed and
+// the arguments are left untouched.
+void absDiff(const Size2D &size,
+             const s32 *src0Base, ptrdiff_t src0Stride,
+             const s32 *src1Base, ptrdiff_t src1Stride,
+             s32 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    AbsDiffSigned<s32> op;
+    internal::vtransform(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, op);
+#else
+    // Silence unused-parameter warnings.
+    (void)size;
+    (void)src0Base; (void)src0Stride;
+    (void)src1Base; (void)src1Stride;
+    (void)dstBase;  (void)dstStride;
+#endif
+}
+
+// absDiff, f32 overload: hands both inputs and the destination to
+// internal::vtransform together with an AbsDiff<f32> functor.
+// In builds without CAROTENE_NEON only the configuration check is executed and
+// the arguments are left untouched.
+void absDiff(const Size2D &size,
+             const f32 * src0Base, ptrdiff_t src0Stride,
+             const f32 * src1Base, ptrdiff_t src1Stride,
+             f32 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    AbsDiff<f32> op;
+    internal::vtransform(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, op);
+#else
+    // Silence unused-parameter warnings.
+    (void)size;
+    (void)src0Base; (void)src0Stride;
+    (void)src1Base; (void)src1Stride;
+    (void)dstBase;  (void)dstStride;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/accumulate.cpp
+++ b/3rdparty/carotene/src/accumulate.cpp
@ -0,0 +1,408 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+#include <cstring>
+
+namespace CAROTENE_NS {
+
+/*
+ * Adds an unsigned 8-bit image to a signed 16-bit accumulator in place:
+ *     dst(x, y) = saturate_s16(dst(x, y) + src(x, y))
+ *
+ * size      - width and height of the processed region, in elements
+ * srcBase   - first row of the u8 input; rows are located with
+ *             internal::getRowPtr(srcBase, srcStride, row)
+ * dstBase   - first row of the s16 accumulator, read and written; rows are
+ *             located with internal::getRowPtr(dstBase, dstStride, row)
+ *
+ * Without CAROTENE_NEON the processing code is compiled out: the function
+ * only calls internal::assertSupportedConfiguration() and ignores its arguments.
+ */
+void accumulate(const Size2D &size,
+                const u8 *srcBase, ptrdiff_t srcStride,
+                s16 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Loop bounds chosen so that j < roiw16 implies j + 16 <= width (and
+    // likewise for 8); clamped to 0 so narrow rows skip the vector loops
+    // instead of underflowing the unsigned subtraction.
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const u8* src = internal::getRowPtr(srcBase, srcStride, i);
+        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        // 16 elements per iteration: widen the u8 lanes to 16 bits and add
+        // them to the accumulator with a saturating add.
+        for (; j < roiw16; j += 16)
+        {
+            internal::prefetch(src + j);
+            internal::prefetch(dst + j);
+            uint8x16_t v_src = vld1q_u8(src + j);
+            int16x8_t v_dst0 = vld1q_s16(dst + j);
+            int16x8_t v_dst1 = vld1q_s16(dst + j + 8);
+            int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
+            int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
+            v_dst0 = vqaddq_s16(v_dst0, v_src0);
+            v_dst1 = vqaddq_s16(v_dst1, v_src1);
+            vst1q_s16(dst + j, v_dst0);
+            vst1q_s16(dst + j + 8, v_dst1);
+        }
+        // 8 elements per iteration for what the 16-wide loop left over.
+        for (; j < roiw8; j += 8)
+        {
+            uint8x8_t v_src = vld1_u8(src + j);
+            int16x8_t v_src16 = vreinterpretq_s16_u16(vmovl_u8(v_src));
+            int16x8_t v_dst = vld1q_s16(dst + j);
+            v_dst = vqaddq_s16(v_dst, v_src16);
+            vst1q_s16(dst + j, v_dst);
+        }
+
+        // Scalar tail for the remaining elements of the row.
+        for (; j < size.width; j++)
+            dst[j] = internal::saturate_cast<s16>(src[j] + dst[j]);
+    }
+#else
+    // Silence unused-parameter warnings in builds without NEON.
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+/*
+ * Accumulates the shifted square of a u8 image into an s16 accumulator in place:
+ *     dst(x, y) = saturate_s16(dst(x, y) + ((src(x, y) * src(x, y)) >> shift))
+ *
+ * shift     - compile-time right shift applied to the 32-bit square; shift == 0
+ *             is handled by the explicit specialization of this template
+ * size      - width and height of the processed region, in elements
+ * srcBase   - first row of the u8 input (rows located via internal::getRowPtr)
+ * dstBase   - first row of the s16 accumulator, read and written
+ */
+template <int shift>
+void accumulateSquareConst(const Size2D &size,
+                           const u8 *srcBase, ptrdiff_t srcStride,
+                           s16 *dstBase, ptrdiff_t dstStride)
+{
+    // j < roiw16 implies j + 16 <= width (likewise for 8); clamped to 0 so
+    // narrow rows skip the vector loops.
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const u8* src = internal::getRowPtr(srcBase, srcStride, i);
+        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        // 16 elements per iteration. Each group of four lanes is squared into
+        // 32 bits (vmull_s16), shifted right, added to the widened accumulator
+        // (vaddw_s16) and narrowed back to 16 bits with saturation (vqmovn_s32).
+        for (; j < roiw16; j += 16)
+        {
+            internal::prefetch(src + j);
+            internal::prefetch(dst + j);
+            uint8x16_t v_src = vld1q_u8(src + j);
+            int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
+            int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
+            int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
+
+            int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
+            v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst0))),
+                                  vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst0))));
+
+            v_srclo = vget_low_s16(v_src1);
+            v_srchi = vget_high_s16(v_src1);
+            v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst1))),
+                                  vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst1))));
+
+            vst1q_s16(dst + j, v_dst0);
+            vst1q_s16(dst + j + 8, v_dst1);
+        }
+        // 8 elements per iteration, same arithmetic as above.
+        for (; j < roiw8; j += 8)
+        {
+            int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
+            int16x8_t v_dst = vld1q_s16(dst + j);
+            int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
+            v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst))),
+                                 vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst))));
+            vst1q_s16(dst + j, v_dst);
+        }
+
+        // Scalar tail: the square is formed in s32 before shifting.
+        for (; j < size.width; j++)
+        {
+            s32 srcVal = src[j];
+            dst[j] = internal::saturate_cast<s16>(dst[j] + ((srcVal * srcVal) >> shift));
+        }
+    }
+}
+
+/*
+ * Specialization for shift == 0: accumulates the unshifted square,
+ *     dst(x, y) = saturate_s16(dst(x, y) + src(x, y) * src(x, y))
+ *
+ * The body mirrors the generic template with the vshrq_n_s32 step removed
+ * (presumably because the shift-by-immediate intrinsic does not accept a
+ * shift of 0 - confirm against the NEON intrinsics reference).
+ */
+template <>
+void accumulateSquareConst<0>(const Size2D &size,
+                              const u8 *srcBase, ptrdiff_t srcStride,
+                              s16 *dstBase, ptrdiff_t dstStride)
+{
+    // j < roiw16 implies j + 16 <= width (likewise for 8); clamped to 0 so
+    // narrow rows skip the vector loops.
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const u8* src = internal::getRowPtr(srcBase, srcStride, i);
+        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        // 16 elements per iteration: square into 32 bits, add the widened
+        // accumulator, narrow back to 16 bits with saturation.
+        for (; j < roiw16; j += 16)
+        {
+            internal::prefetch(src + j);
+            internal::prefetch(dst + j);
+            uint8x16_t v_src = vld1q_u8(src + j);
+            int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
+            int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
+            int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
+
+            int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
+            v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst0))),
+                                  vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst0))));
+
+            v_srclo = vget_low_s16(v_src1);
+            v_srchi = vget_high_s16(v_src1);
+            v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst1))),
+                                  vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst1))));
+
+            vst1q_s16(dst + j, v_dst0);
+            vst1q_s16(dst + j + 8, v_dst1);
+        }
+        // 8 elements per iteration, same arithmetic as above.
+        for (; j < roiw8; j += 8)
+        {
+            int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
+            int16x8_t v_dst = vld1q_s16(dst + j);
+            int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
+            v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst))),
+                                 vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst))));
+            vst1q_s16(dst + j, v_dst);
+        }
+
+        // Scalar tail: the square is formed in s32.
+        for (; j < size.width; j++)
+        {
+            s32 srcVal = src[j];
+            dst[j] = internal::saturate_cast<s16>(dst[j] + srcVal * srcVal);
+        }
+    }
+}
+
+// Pointer to one instantiation of accumulateSquareConst<shift>; all instantiations
+// share this signature, which lets accumulateSquare() pick one by runtime shift.
+typedef void (* accumulateSquareConstFunc)(const Size2D &size,
+                                           const u8 *srcBase, ptrdiff_t srcStride,
+                                           s16 *dstBase, ptrdiff_t dstStride);
+
+} // namespace
+
+#endif
+
+// Accumulates shifted squares of an 8-bit image into a 16-bit accumulator:
+//     dst = saturate_s16(dst + ((src * src) >> shift))
+//
+// size                - image dimensions
+// srcBase / srcStride - 8-bit source image and its row stride
+// dstBase / dstStride - 16-bit accumulator (read and written) and its row stride
+// shift               - right shift applied to every square before accumulation
+void accumulateSquare(const Size2D &size,
+                      const u8 *srcBase, ptrdiff_t srcStride,
+                      s16 *dstBase, ptrdiff_t dstStride,
+                      u32 shift)
+{
+    // The square of an 8-bit value is at most 255 * 255 = 65025 < 2^16, so with
+    // shift >= 16 every increment is zero. Accumulating zero must leave the
+    // accumulator untouched (it must not be cleared).
+    if (shift >= 16)
+        return;
+
+    internal::assertSupportedConfiguration();
+
+#ifdef CAROTENE_NEON
+    // this ugly construction is needed to avoid:
+    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
+    // return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
+    //
+    // The table maps the runtime shift onto the instantiation that has the shift
+    // baked in as a compile-time constant. It is immutable, so build it only once.
+    static const accumulateSquareConstFunc funcs[16] =
+    {
+        accumulateSquareConst<0>,
+        accumulateSquareConst<1>,
+        accumulateSquareConst<2>,
+        accumulateSquareConst<3>,
+        accumulateSquareConst<4>,
+        accumulateSquareConst<5>,
+        accumulateSquareConst<6>,
+        accumulateSquareConst<7>,
+        accumulateSquareConst<8>,
+        accumulateSquareConst<9>,
+        accumulateSquareConst<10>,
+        accumulateSquareConst<11>,
+        accumulateSquareConst<12>,
+        accumulateSquareConst<13>,
+        accumulateSquareConst<14>,
+        accumulateSquareConst<15>
+    };
+
+    // shift < 16 is guaranteed by the early return above.
+    funcs[shift](size, srcBase, srcStride, dstBase, dstStride);
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)shift;
+#endif
+}
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+// Functor for the alpha == 0.5 case: each output is the truncated average
+// (src0 + src1) >> 1 of the two inputs.
+struct AccumulateWeightedHalf
+{
+    typedef u8 type;
+
+    // 16 pixels at once: halving add of the two vectors.
+    void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
+                     uint8x16_t & v_dst) const
+    {
+        const uint8x16_t v_avg = vhaddq_u8(v_src0, v_src1);
+        v_dst = v_avg;
+    }
+
+    // 8 pixels at once: halving add of the two vectors.
+    void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
+                     uint8x8_t & v_dst) const
+    {
+        const uint8x8_t v_avg = vhadd_u8(v_src0, v_src1);
+        v_dst = v_avg;
+    }
+
+    // Single pixel: the sum is formed in a wider type before halving.
+    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
+    {
+        *dst = ((u16)(*src0) + *src1) >> 1;
+    }
+};
+
+// Functor computing dst = beta * src1 + alpha * src0 with beta = 1 - alpha for
+// 8-bit pixels; the arithmetic is done in f32 and converted back to u8.
+struct AccumulateWeighted
+{
+    typedef u8 type;
+
+    // Scalar weights and their broadcast vector counterparts.
+    float alpha, beta;
+    float32x4_t v_alpha, v_beta;
+
+    // _alpha is the weight of src0; src1 receives the complementary weight 1 - _alpha.
+    explicit AccumulateWeighted(float _alpha) :
+        alpha(_alpha), beta(1 - _alpha)
+    {
+        v_alpha = vdupq_n_f32(alpha);
+        v_beta = vdupq_n_f32(beta);
+    }
+
+    // 16 pixels: processed as two halves of 8, each split into two groups of 4 floats.
+    void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
+                     uint8x16_t & v_dst) const
+    {
+        // Low 8 pixels: u8 -> u16 -> u32 -> f32, then beta * src1 + alpha * src0.
+        uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
+        uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));
+        float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
+                                        v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
+        float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
+                                        v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
+        // Convert back to integers and narrow to 16 bits.
+        uint16x8_t v_dst0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
+                                         vmovn_u32(vcvtq_u32_f32(v_dst1f)));
+
+        // High 8 pixels: same computation.
+        v_src0_p = vmovl_u8(vget_high_u8(v_src0));
+        v_src1_p = vmovl_u8(vget_high_u8(v_src1));
+        v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
+                            v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
+        v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
+                            v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
+        uint16x8_t v_dst1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
+                                         vmovn_u32(vcvtq_u32_f32(v_dst1f)));
+
+        // Narrow both halves to 8 bits (non-saturating narrow) and join them.
+        v_dst = vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1));
+    }
+
+    // 8 pixels: same scheme as one half of the 16-pixel overload.
+    void operator() (const uint8x8_t & _v_src0, const uint8x8_t & _v_src1,
+                     uint8x8_t & v_dst) const
+    {
+        uint16x8_t v_src0 = vmovl_u8(_v_src0), v_src1 = vmovl_u8(_v_src1);
+
+        float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_beta),
+                                        v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))));
+        float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_beta),
+                                        v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))));
+        uint16x8_t _v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
+                                        vmovn_u32(vcvtq_u32_f32(v_dst1f)));
+
+        v_dst = vmovn_u16(_v_dst);
+    }
+
+    // Single pixel: the float result is implicitly converted to u8 on assignment.
+    // NOTE(review): no clamping here - presumably relies on alpha being within [0, 1]; confirm with callers.
+    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
+    {
+        dst[0] = beta * src1[0] + alpha * src0[0];
+    }
+};
+
+} // namespace
+
+#endif
+
+// Blends an 8-bit source image into an 8-bit accumulator in place:
+//     dst = (1 - alpha) * dst + alpha * src
+//
+// size                - image dimensions
+// srcBase / srcStride - source image and its row stride
+// dstBase / dstStride - accumulator (read and written) and its row stride
+// alpha               - weight given to the source image
+void accumulateWeighted(const Size2D &size,
+                        const u8 *srcBase, ptrdiff_t srcStride,
+                        u8 *dstBase, ptrdiff_t dstStride,
+                        f32 alpha)
+{
+    // Zero weight for the source: the accumulator keeps its contents.
+    if (alpha == 0.0f)
+        return;
+
+    // Full weight for the source: the accumulator becomes a row-by-row copy of it.
+    if (alpha == 1.0f)
+    {
+        for (size_t row = 0; row < size.height; ++row)
+        {
+            const u8 * srcRow = internal::getRowPtr(srcBase, srcStride, row);
+            u8 * dstRow = internal::getRowPtr(dstBase, dstStride, row);
+            std::memcpy(dstRow, srcRow, sizeof(u8) * size.width);
+        }
+        return;
+    }
+
+    internal::assertSupportedConfiguration();
+
+#ifdef CAROTENE_NEON
+    if (alpha == 0.5f)
+    {
+        // Equal weights reduce to dst[p] = (src[p] + dst[p]) >> 1, which is faster
+        // than the general floating-point path.
+        internal::vtransform(size,
+                             srcBase, srcStride,
+                             dstBase, dstStride,
+                             dstBase, dstStride,
+                             AccumulateWeightedHalf());
+    }
+    else
+    {
+        // General case: the accumulator is both the second input and the output.
+        internal::vtransform(size,
+                             srcBase, srcStride,
+                             dstBase, dstStride,
+                             dstBase, dstStride,
+                             AccumulateWeighted(alpha));
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)alpha;
+#endif
+}
+
+} //namespace CAROTENE_NS
--- a/3rdparty/carotene/src/add.cpp
+++ b/3rdparty/carotene/src/add.cpp
@ -0,0 +1,475 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+// Element-wise addition functor without saturation.
+// T is the element type, WT the wider type used for the scalar sum.
+template <typename T, typename WT>
+struct AddWrap
+{
+    typedef T type;
+
+    // 128-bit vector overload.
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
+                     const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::vec128 & v_dst) const
+    {
+        v_dst = internal::vaddq(v_src0, v_src1);
+    }
+
+    // 64-bit vector overload.
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
+                     const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::vec64 & v_dst) const
+    {
+        v_dst = internal::vadd(v_src0, v_src1);
+    }
+
+    // Scalar overload: sum in WT, then convert back to T with a plain cast.
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        const WT lhs = (WT)*src0;
+        const WT rhs = (WT)*src1;
+        *dst = (T)(lhs + rhs);
+    }
+};
+
+// Element-wise addition functor with saturation.
+// T is the element type, WT the wider type used for the scalar sum.
+template <typename T, typename WT>
+struct AddSaturate
+{
+    typedef T type;
+
+    // 128-bit vector overload.
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
+                     const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::vec128 & v_dst) const
+    {
+        v_dst = internal::vqaddq(v_src0, v_src1);
+    }
+
+    // 64-bit vector overload.
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
+                     const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::vec64 & v_dst) const
+    {
+        v_dst = internal::vqadd(v_src0, v_src1);
+    }
+
+    // Scalar overload: sum in WT, then narrow through saturate_cast.
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        const WT lhs = (WT)*src0;
+        const WT rhs = (WT)*src1;
+        *dst = internal::saturate_cast<T>(lhs + rhs);
+    }
+};
+
+} // namespace
+
+#endif
+
+// dst = src0 + src1 for u8 images.
+// policy selects saturating addition (CONVERT_POLICY_SATURATE) or wrap-around (any other value).
+void add(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         u8 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy != CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddWrap<u8, u16>());
+        return;
+    }
+
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         AddSaturate<u8, u16>());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+// dst = src0 + src1 for s8 images.
+// policy selects saturating addition (CONVERT_POLICY_SATURATE) or wrap-around (any other value).
+void add(const Size2D &size,
+         const s8 * src0Base, ptrdiff_t src0Stride,
+         const s8 * src1Base, ptrdiff_t src1Stride,
+         s8 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy != CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddWrap<s8, s16>());
+        return;
+    }
+
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         AddSaturate<s8, s16>());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+// Widening addition: dst(16-bit) = src0(u8) + src1(u8).
+// The conversion policy is ignored: the sum of two u8 values is at most 510,
+// so it always fits the 16-bit destination.
+void add(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Loop limits for the 32- and 8-pixel steps, clamped to 0 for narrow rows.
+    size_t limit32 = size.width >= 31 ? size.width - 31 : 0;
+    size_t limit8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const u8 * rowA = internal::getRowPtr(src0Base, src0Stride, row);
+        const u8 * rowB = internal::getRowPtr(src1Base, src1Stride, row);
+        // The destination is written through an unsigned view of the same 16-bit storage.
+        u16 * rowOut = internal::getRowPtr((u16 *)dstBase, dstStride, row);
+        size_t x = 0;
+
+        // 32 pixels per iteration: two 16-byte loads per source, four widening adds.
+        for (; x < limit32; x += 32)
+        {
+            internal::prefetch(rowA + x);
+            internal::prefetch(rowB + x);
+            uint8x16_t v_a0 = vld1q_u8(rowA + x), v_a1 = vld1q_u8(rowA + x + 16);
+            uint8x16_t v_b0 = vld1q_u8(rowB + x), v_b1 = vld1q_u8(rowB + x + 16);
+            vst1q_u16(rowOut + x, vaddl_u8(vget_low_u8(v_a0), vget_low_u8(v_b0)));
+            vst1q_u16(rowOut + x + 8, vaddl_u8(vget_high_u8(v_a0), vget_high_u8(v_b0)));
+            vst1q_u16(rowOut + x + 16, vaddl_u8(vget_low_u8(v_a1), vget_low_u8(v_b1)));
+            vst1q_u16(rowOut + x + 24, vaddl_u8(vget_high_u8(v_a1), vget_high_u8(v_b1)));
+        }
+        // 8 pixels per iteration.
+        for (; x < limit8; x += 8)
+        {
+            uint8x8_t v_a = vld1_u8(rowA + x);
+            uint8x8_t v_b = vld1_u8(rowB + x);
+            vst1q_u16(rowOut + x, vaddl_u8(v_a, v_b));
+        }
+
+        // Scalar tail.
+        for (; x < size.width; x++)
+            rowOut[x] = (u16)rowA[x] + (u16)rowB[x];
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+// Mixed-type addition: dst(s16) = src0(u8) + src1(s16).
+// policy selects saturating addition (CONVERT_POLICY_SATURATE) or wrap-around (any other value).
+void add(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const s16 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Loop limits for the 16- and 8-pixel steps, clamped to 0 for narrow rows.
+    size_t limit16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t limit8 = size.width >= 7 ? size.width - 7 : 0;
+    const bool saturate = (policy == CONVERT_POLICY_SATURATE);
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const u8 * rowA = internal::getRowPtr(src0Base, src0Stride, row);
+        const s16 * rowB = internal::getRowPtr(src1Base, src1Stride, row);
+        s16 * rowOut = internal::getRowPtr(dstBase, dstStride, row);
+        size_t x = 0;
+
+        if (saturate)
+        {
+            // 16 pixels per iteration: zero-extend the u8 input, saturating s16 add.
+            for (; x < limit16; x += 16)
+            {
+                internal::prefetch(rowA + x);
+                internal::prefetch(rowB + x);
+                uint8x16_t v_a = vld1q_u8(rowA + x);
+                int16x8_t v_aLo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_a)));
+                int16x8_t v_aHi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_a)));
+                int16x8_t v_bLo = vld1q_s16(rowB + x), v_bHi = vld1q_s16(rowB + x + 8);
+                vst1q_s16(rowOut + x, vqaddq_s16(v_aLo, v_bLo));
+                vst1q_s16(rowOut + x + 8, vqaddq_s16(v_aHi, v_bHi));
+            }
+            // 8 pixels per iteration.
+            for (; x < limit8; x += 8)
+            {
+                int16x8_t v_a = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(rowA + x)));
+                int16x8_t v_b = vld1q_s16(rowB + x);
+                vst1q_s16(rowOut + x, vqaddq_s16(v_a, v_b));
+            }
+
+            // Scalar tail: sum in 32 bits, then saturate to s16.
+            for (; x < size.width; x++)
+                rowOut[x] = internal::saturate_cast<s16>((s32)rowA[x] + (s32)rowB[x]);
+        }
+        else
+        {
+            // Same structure as above, using the non-saturating add.
+            for (; x < limit16; x += 16)
+            {
+                internal::prefetch(rowA + x);
+                internal::prefetch(rowB + x);
+                uint8x16_t v_a = vld1q_u8(rowA + x);
+                int16x8_t v_aLo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_a)));
+                int16x8_t v_aHi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_a)));
+                int16x8_t v_bLo = vld1q_s16(rowB + x), v_bHi = vld1q_s16(rowB + x + 8);
+                vst1q_s16(rowOut + x, vaddq_s16(v_aLo, v_bLo));
+                vst1q_s16(rowOut + x + 8, vaddq_s16(v_aHi, v_bHi));
+            }
+            for (; x < limit8; x += 8)
+            {
+                int16x8_t v_a = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(rowA + x)));
+                int16x8_t v_b = vld1q_s16(rowB + x);
+                vst1q_s16(rowOut + x, vaddq_s16(v_a, v_b));
+            }
+
+            // Scalar tail: sum in 32 bits, then convert to s16 with a plain cast.
+            for (; x < size.width; x++)
+                rowOut[x] = (s16)((s32)rowA[x] + (s32)rowB[x]);
+        }
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+// dst = src0 + src1 for s16 images.
+// policy selects saturating addition (CONVERT_POLICY_SATURATE) or wrap-around (any other value).
+void add(const Size2D &size,
+         const s16 * src0Base, ptrdiff_t src0Stride,
+         const s16 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy != CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddWrap<s16, s32>());
+        return;
+    }
+
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         AddSaturate<s16, s32>());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+// dst = src0 + src1 for u16 images.
+// policy selects saturating addition (CONVERT_POLICY_SATURATE) or wrap-around (any other value).
+void add(const Size2D &size,
+         const u16 * src0Base, ptrdiff_t src0Stride,
+         const u16 * src1Base, ptrdiff_t src1Stride,
+         u16 * dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy != CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddWrap<u16, u32>());
+        return;
+    }
+
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         AddSaturate<u16, u32>());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+// dst = src0 + src1 for s32 images.
+// policy selects saturating addition (CONVERT_POLICY_SATURATE) or wrap-around (any other value).
+void add(const Size2D &size,
+         const s32 * src0Base, ptrdiff_t src0Stride,
+         const s32 * src1Base, ptrdiff_t src1Stride,
+         s32 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy != CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddWrap<s32, s64>());
+        return;
+    }
+
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         AddSaturate<s32, s64>());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+// dst = src0 + src1 for u32 images.
+// policy selects saturating addition (CONVERT_POLICY_SATURATE) or wrap-around (any other value).
+void add(const Size2D &size,
+         const u32 * src0Base, ptrdiff_t src0Stride,
+         const u32 * src1Base, ptrdiff_t src1Stride,
+         u32 * dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy != CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             AddWrap<u32, u64>());
+        return;
+    }
+
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         AddSaturate<u32, u64>());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+// dst = src0 + src1 for f32 images. There is no conversion policy for floats:
+// the plain (non-saturating) add functor is always used.
+void add(const Size2D &size,
+         const f32 * src0Base, ptrdiff_t src0Stride,
+         const f32 * src1Base, ptrdiff_t src1Stride,
+         f32 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    AddWrap<f32, f32> floatAdd;
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         floatAdd);
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/add_weighted.cpp
+++ b/3rdparty/carotene/src/add_weighted.cpp
@ -0,0 +1,265 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+using namespace internal;
+
+// Per-element-type companion types:
+//   wide   - type with twice the width (f64 for f32)
+//   narrow - type with half the width (not defined for the 8-bit types and f32)
+//   unsign - unsigned type of the same width (not defined for f32)
+//   vec128 - 128-bit NEON vector of the element type
+template <typename T> struct TypeTraits;
+
+template <> struct TypeTraits<u8>
+{
+    typedef u16 wide;
+    typedef u8 unsign;
+    typedef uint8x16_t vec128;
+};
+
+template <> struct TypeTraits<s8>
+{
+    typedef s16 wide;
+    typedef u8 unsign;
+    typedef int8x16_t vec128;
+};
+
+template <> struct TypeTraits<u16>
+{
+    typedef u32 wide;
+    typedef u8 narrow;
+    typedef u16 unsign;
+    typedef uint16x8_t vec128;
+};
+
+template <> struct TypeTraits<s16>
+{
+    typedef s32 wide;
+    typedef s8 narrow;
+    typedef u16 unsign;
+    typedef int16x8_t vec128;
+};
+
+template <> struct TypeTraits<u32>
+{
+    typedef u64 wide;
+    typedef u16 narrow;
+    typedef u32 unsign;
+    typedef uint32x4_t vec128;
+};
+
+template <> struct TypeTraits<s32>
+{
+    typedef s64 wide;
+    typedef s16 narrow;
+    typedef u32 unsign;
+    typedef int32x4_t vec128;
+};
+
+template <> struct TypeTraits<f32>
+{
+    typedef f64 wide;
+    typedef float32x4_t vec128;
+};
+
+// Weighted-sum functor, dst = alpha * src0 + beta * src1 + gamma, for element
+// types narrower than 32 bits. The vector overloads widen the inputs, delegate to
+// the functor of the wider type and narrow the result back with saturation.
+template <typename T> struct wAdd
+{
+    typedef T type;
+    typedef typename TypeTraits<T>::wide wtype;
+
+    f32 alpha, beta, gamma;
+    // Functor for the next wider element type, built from the same coefficients.
+    wAdd<wtype> wideAdd;
+
+    wAdd(f32 _alpha, f32 _beta, f32 _gamma):
+        alpha(_alpha), beta(_beta), gamma(_gamma),
+        wideAdd(_alpha, _beta, _gamma) {}
+
+    // 128-bit overload: the low and high halves are processed separately.
+    void operator() (const typename VecTraits<T>::vec128 & v_src0,
+                     const typename VecTraits<T>::vec128 & v_src1,
+                     typename VecTraits<T>::vec128 & v_dst) const
+    {
+        typename VecTraits<wtype>::vec128 v_lo, v_hi;
+
+        wideAdd(vmovl( vget_low(v_src0)), vmovl( vget_low(v_src1)), v_lo);
+        wideAdd(vmovl(vget_high(v_src0)), vmovl(vget_high(v_src1)), v_hi);
+
+        v_dst = vcombine(vqmovn(v_lo), vqmovn(v_hi));
+    }
+
+    // 64-bit overload: widens to a single 128-bit vector.
+    void operator() (const typename VecTraits<T>::vec64 & v_src0,
+                     const typename VecTraits<T>::vec64 & v_src1,
+                     typename VecTraits<T>::vec64 & v_dst) const
+    {
+        typename VecTraits<wtype>::vec128 v_wide;
+
+        wideAdd(vmovl(v_src0), vmovl(v_src1), v_wide);
+
+        v_dst = vqmovn(v_wide);
+    }
+
+    // Scalar overload: computed in floating point, converted through saturate_cast.
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        *dst = saturate_cast<T>(alpha*src0[0] + beta*src1[0] + gamma);
+    }
+};
+
+// Weighted-sum functor for s32 elements: dst = alpha * src0 + beta * src1 + gamma.
+// The vector overloads compute in f32 and round to the nearest integer, with ties
+// rounded away from zero. The narrower signed types reach this specialisation
+// through the widening generic wAdd<T>.
+template <> struct wAdd<s32>
+{
+    typedef s32 type;
+
+    f32 alpha, beta, gamma;
+    float32x4_t valpha, vbeta, vgamma;
+    wAdd(f32 _alpha, f32 _beta, f32 _gamma):
+        alpha(_alpha), beta(_beta), gamma(_gamma)
+    {
+        valpha = vdupq_n_f32(_alpha);
+        vbeta = vdupq_n_f32(_beta);
+        // No rounding bias is folded into gamma: vcvt(q)_s32_f32 truncates toward
+        // zero, so a fixed +0.5 would turn negative results such as -2.0 into -1.
+        // The bias is applied per lane, with the lane's sign, in roundToS32().
+        vgamma = vdupq_n_f32(_gamma);
+    }
+
+    // 4 elements at once.
+    void operator() (const typename VecTraits<s32>::vec128 & v_src0,
+                     const typename VecTraits<s32>::vec128 & v_src1,
+                     typename VecTraits<s32>::vec128 & v_dst) const
+    {
+        float32x4_t vs1 = vcvtq_f32_s32(v_src0);
+        float32x4_t vs2 = vcvtq_f32_s32(v_src1);
+
+        vs1 = vmlaq_f32(vgamma, vs1, valpha);
+        vs1 = vmlaq_f32(vs1, vs2, vbeta);
+        v_dst = roundToS32(vs1);
+    }
+
+    // 2 elements at once.
+    void operator() (const typename VecTraits<s32>::vec64 & v_src0,
+                     const typename VecTraits<s32>::vec64 & v_src1,
+                     typename VecTraits<s32>::vec64 & v_dst) const
+    {
+        float32x2_t vs1 = vcvt_f32_s32(v_src0);
+        float32x2_t vs2 = vcvt_f32_s32(v_src1);
+
+        vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
+        vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
+        v_dst = roundToS32(vs1);
+    }
+
+    // Single element: computed in floating point, converted through saturate_cast.
+    void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
+    {
+        dst[0] = saturate_cast<s32>(alpha*src0[0] + beta*src1[0] + gamma);
+    }
+
+private:
+    // Rounds each lane to the nearest integer: adds 0.5 carrying the lane's own
+    // sign bit, then lets the conversion truncate toward zero.
+    static int32x4_t roundToS32(const float32x4_t & v)
+    {
+        uint32x4_t v_sign = vandq_u32(vreinterpretq_u32_f32(v), vdupq_n_u32(0x80000000u));
+        float32x4_t v_half = vreinterpretq_f32_u32(vorrq_u32(v_sign, vreinterpretq_u32_f32(vdupq_n_f32(0.5f))));
+        return vcvtq_s32_f32(vaddq_f32(v, v_half));
+    }
+
+    // 64-bit counterpart of the helper above.
+    static int32x2_t roundToS32(const float32x2_t & v)
+    {
+        uint32x2_t v_sign = vand_u32(vreinterpret_u32_f32(v), vdup_n_u32(0x80000000u));
+        float32x2_t v_half = vreinterpret_f32_u32(vorr_u32(v_sign, vreinterpret_u32_f32(vdup_n_f32(0.5f))));
+        return vcvt_s32_f32(vadd_f32(v, v_half));
+    }
+};
+
+// Weighted-sum functor for u32 elements: dst = alpha * src0 + beta * src1 + gamma.
+// The vector overloads compute in f32; the 0.5 folded into vgamma turns the
+// final float-to-unsigned conversion into rounding.
+template <> struct wAdd<u32>
+{
+    typedef u32 type;
+
+    f32 alpha, beta, gamma;
+    float32x4_t valpha, vbeta, vgamma;
+
+    wAdd(f32 _alpha, f32 _beta, f32 _gamma):
+        alpha(_alpha), beta(_beta), gamma(_gamma),
+        valpha(vdupq_n_f32(_alpha)),
+        vbeta(vdupq_n_f32(_beta)),
+        vgamma(vdupq_n_f32(_gamma + 0.5))
+    {
+    }
+
+    // 4 elements at once.
+    void operator() (const typename VecTraits<u32>::vec128 & v_src0,
+                     const typename VecTraits<u32>::vec128 & v_src1,
+                     typename VecTraits<u32>::vec128 & v_dst) const
+    {
+        const float32x4_t v_a = vcvtq_f32_u32(v_src0);
+        const float32x4_t v_b = vcvtq_f32_u32(v_src1);
+
+        // (gamma + 0.5) + a * alpha, then + b * beta.
+        float32x4_t v_acc = vmlaq_f32(vgamma, v_a, valpha);
+        v_acc = vmlaq_f32(v_acc, v_b, vbeta);
+        v_dst = vcvtq_u32_f32(v_acc);
+    }
+
+    // 2 elements at once, using the low halves of the coefficient vectors.
+    void operator() (const typename VecTraits<u32>::vec64 & v_src0,
+                     const typename VecTraits<u32>::vec64 & v_src1,
+                     typename VecTraits<u32>::vec64 & v_dst) const
+    {
+        const float32x2_t v_a = vcvt_f32_u32(v_src0);
+        const float32x2_t v_b = vcvt_f32_u32(v_src1);
+
+        float32x2_t v_acc = vmla_f32(vget_low(vgamma), v_a, vget_low(valpha));
+        v_acc = vmla_f32(v_acc, v_b, vget_low(vbeta));
+        v_dst = vcvt_u32_f32(v_acc);
+    }
+
+    // Single element: computed in floating point, converted through saturate_cast.
+    void operator() (const u32 * src0, const u32 * src1, u32 * dst) const
+    {
+        *dst = saturate_cast<u32>(alpha*src0[0] + beta*src1[0] + gamma);
+    }
+};
+
+// Weighted-sum functor for f32 elements: dst = alpha * src0 + beta * src1 + gamma.
+// The result stays in floating point, so no rounding bias is involved and the
+// vector overloads use the same gamma as the scalar overload.
+template <> struct wAdd<f32>
+{
+    typedef f32 type;
+
+    f32 alpha, beta, gamma;
+    float32x4_t valpha, vbeta, vgamma;
+    wAdd(f32 _alpha, f32 _beta, f32 _gamma):
+        alpha(_alpha), beta(_beta), gamma(_gamma)
+    {
+        valpha = vdupq_n_f32(_alpha);
+        vbeta = vdupq_n_f32(_beta);
+        // Plain gamma: a +0.5 bias is only meaningful before a float-to-integer
+        // conversion and would offset every vectorised float result by 0.5.
+        vgamma = vdupq_n_f32(_gamma);
+    }
+
+    // 4 elements at once.
+    void operator() (const typename VecTraits<f32>::vec128 & v_src0,
+                     const typename VecTraits<f32>::vec128 & v_src1,
+                     typename VecTraits<f32>::vec128 & v_dst) const
+    {
+        float32x4_t vs1 = vmlaq_f32(vgamma, v_src0, valpha);
+        v_dst = vmlaq_f32(vs1, v_src1, vbeta);
+    }
+
+    // 2 elements at once, using the low halves of the coefficient vectors.
+    void operator() (const typename VecTraits<f32>::vec64 & v_src0,
+                     const typename VecTraits<f32>::vec64 & v_src1,
+                     typename VecTraits<f32>::vec64 & v_dst) const
+    {
+        float32x2_t vs1 = vmla_f32(vget_low(vgamma), v_src0, vget_low(valpha));
+        v_dst = vmla_f32(vs1, v_src1, vget_low(vbeta));
+    }
+
+    // Single element.
+    void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
+    {
+        dst[0] = alpha*src0[0] + beta*src1[0] + gamma;
+    }
+};
+
+} // namespace
+
+// NEON build: defines addWeighted() for one element type by running the matching
+// wAdd<type> functor, built from (alpha, beta, gamma), over both source images.
+#define IMPL_ADDWEIGHTED(type)                                \
+void addWeighted(const Size2D &size,                          \
+                 const type * src0Base, ptrdiff_t src0Stride, \
+                 const type * src1Base, ptrdiff_t src1Stride, \
+                 type * dstBase, ptrdiff_t dstStride,         \
+                 f32 alpha, f32 beta, f32 gamma)              \
+{                                                             \
+    internal::assertSupportedConfiguration();                 \
+    internal::vtransform(size,                                \
+                         src0Base, src0Stride,                \
+                         src1Base, src1Stride,                \
+                         dstBase, dstStride,                  \
+                         wAdd<type>(alpha, beta, gamma));     \
+}
+
+#else
+
+// Build without NEON: defines addWeighted() with the same signature but no
+// processing; the body only calls internal::assertSupportedConfiguration().
+#define IMPL_ADDWEIGHTED(type)                                \
+void addWeighted(const Size2D &,                              \
+                 const type *, ptrdiff_t,                     \
+                 const type *, ptrdiff_t,                     \
+                 type *, ptrdiff_t,                           \
+                 f32, f32, f32)                               \
+{                                                             \
+    internal::assertSupportedConfiguration();                 \
+}
+
+#endif
+
+// One addWeighted() overload per supported element type.
+IMPL_ADDWEIGHTED(u8)
+IMPL_ADDWEIGHTED(s8)
+IMPL_ADDWEIGHTED(u16)
+IMPL_ADDWEIGHTED(s16)
+IMPL_ADDWEIGHTED(u32)
+IMPL_ADDWEIGHTED(s32)
+IMPL_ADDWEIGHTED(f32)
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/bitwise.cpp
+++ b/3rdparty/carotene/src/bitwise.cpp
@ -0,0 +1,225 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+// Functor passed to internal::vtransform by bitwiseAnd(): bitwise AND of two
+// u8 operands, with overloads for 16-byte vectors, 8-byte vectors and single
+// elements.
+struct BitwiseAnd
+{
+    typedef u8 type;
+
+    // 16 bytes per call
+    void operator() (const uint8x16_t & lhs, const uint8x16_t & rhs,
+                     uint8x16_t & result) const
+    {
+        result = vandq_u8(lhs, rhs);
+    }
+
+    // 8 bytes per call
+    void operator() (const uint8x8_t & lhs, const uint8x8_t & rhs,
+                     uint8x8_t & result) const
+    {
+        result = vand_u8(lhs, rhs);
+    }
+
+    // one element per call
+    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
+    {
+        *dst = *src0 & *src1;
+    }
+};
+
+// Functor passed to internal::vtransform by bitwiseOr(): bitwise OR of two
+// u8 operands, with overloads for 16-byte vectors, 8-byte vectors and single
+// elements.
+struct BitwiseOr
+{
+    typedef u8 type;
+
+    // 16 bytes per call
+    void operator() (const uint8x16_t & lhs, const uint8x16_t & rhs,
+                     uint8x16_t & result) const
+    {
+        result = vorrq_u8(lhs, rhs);
+    }
+
+    // 8 bytes per call
+    void operator() (const uint8x8_t & lhs, const uint8x8_t & rhs,
+                     uint8x8_t & result) const
+    {
+        result = vorr_u8(lhs, rhs);
+    }
+
+    // one element per call
+    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
+    {
+        *dst = *src0 | *src1;
+    }
+};
+
+// Functor passed to internal::vtransform by bitwiseXor(): bitwise XOR of two
+// u8 operands, with overloads for 16-byte vectors, 8-byte vectors and single
+// elements.
+struct BitwiseXor
+{
+    typedef u8 type;
+
+    // 16 bytes per call
+    void operator() (const uint8x16_t & lhs, const uint8x16_t & rhs,
+                     uint8x16_t & result) const
+    {
+        result = veorq_u8(lhs, rhs);
+    }
+
+    // 8 bytes per call
+    void operator() (const uint8x8_t & lhs, const uint8x8_t & rhs,
+                     uint8x8_t & result) const
+    {
+        result = veor_u8(lhs, rhs);
+    }
+
+    // one element per call
+    void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
+    {
+        *dst = *src0 ^ *src1;
+    }
+};
+
+#endif
+
+// Writes the bitwise complement of every u8 element of the source image into
+// the destination image. Rows are addressed through internal::getRowPtr using
+// the given strides. Without CAROTENE_NEON the function performs no work
+// beyond the configuration assertion.
+void bitwiseNot(const Size2D &size,
+                const u8 *srcBase, ptrdiff_t srcStride,
+                u8 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Exclusive upper bounds for the 32-byte and 8-byte vector loops; they stay
+    // at 0 when the row is too short for even one such step.
+    size_t wideLimit = 0;
+    if (size.width >= 31)
+        wideLimit = size.width - 31;
+    size_t narrowLimit = 0;
+    if (size.width >= 7)
+        narrowLimit = size.width - 7;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const u8* src = internal::getRowPtr(srcBase, srcStride, row);
+        u8* dst = internal::getRowPtr(dstBase, dstStride, row);
+        size_t col = 0;
+
+        // 32 bytes per iteration: both halves are loaded before either is stored
+        while (col < wideLimit)
+        {
+            internal::prefetch(src + col);
+            uint8x16_t lower = vld1q_u8(src + col);
+            uint8x16_t upper = vld1q_u8(src + col + 16);
+            vst1q_u8(dst + col, vmvnq_u8(lower));
+            vst1q_u8(dst + col + 16, vmvnq_u8(upper));
+            col += 32;
+        }
+
+        // 8 bytes per iteration
+        while (col < narrowLimit)
+        {
+            uint8x8_t chunk = vld1_u8(src + col);
+            vst1_u8(dst + col, vmvn_u8(chunk));
+            col += 8;
+        }
+
+        // remaining elements one by one
+        while (col < size.width)
+        {
+            dst[col] = ~src[col];
+            ++col;
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+// Applies the BitwiseAnd functor to the two u8 inputs through
+// internal::vtransform, writing the result to dst. Without CAROTENE_NEON only
+// the configuration assertion runs.
+void bitwiseAnd(const Size2D &size,
+                const u8 *src0Base, ptrdiff_t src0Stride,
+                const u8 *src1Base, ptrdiff_t src1Stride,
+                u8 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifndef CAROTENE_NEON
+    // Nothing is computed in this configuration; mark the arguments as used.
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#else
+    internal::vtransform(size, src0Base, src0Stride, src1Base, src1Stride,
+                         dstBase, dstStride, BitwiseAnd());
+#endif
+}
+
+// Applies the BitwiseOr functor to the two u8 inputs through
+// internal::vtransform, writing the result to dst. Without CAROTENE_NEON only
+// the configuration assertion runs.
+void bitwiseOr(const Size2D &size,
+               const u8 *src0Base, ptrdiff_t src0Stride,
+               const u8 *src1Base, ptrdiff_t src1Stride,
+               u8 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifndef CAROTENE_NEON
+    // Nothing is computed in this configuration; mark the arguments as used.
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#else
+    internal::vtransform(size, src0Base, src0Stride, src1Base, src1Stride,
+                         dstBase, dstStride, BitwiseOr());
+#endif
+}
+
+// Applies the BitwiseXor functor to the two u8 inputs through
+// internal::vtransform, writing the result to dst. Without CAROTENE_NEON only
+// the configuration assertion runs.
+void bitwiseXor(const Size2D &size,
+                const u8 *src0Base, ptrdiff_t src0Stride,
+                const u8 *src1Base, ptrdiff_t src1Stride,
+                u8 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifndef CAROTENE_NEON
+    // Nothing is computed in this configuration; mark the arguments as used.
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#else
+    internal::vtransform(size, src0Base, src0Stride, src1Base, src1Stride,
+                         dstBase, dstStride, BitwiseXor());
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/blur.cpp
+++ b/3rdparty/carotene/src/blur.cpp
--- a/3rdparty/carotene/src/canny.cpp
+++ b/3rdparty/carotene/src/canny.cpp
@ -0,0 +1,773 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include "saturate_cast.hpp"
+#include <vector>
+#include <cstring>
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+namespace {
+// Horizontal pass of the 3x3 gradient filter over one row of 8-bit pixels.
+// For every pixel, with left (l0), centre (l1) and right (l2) neighbours, it
+// writes two s16 values:
+//   dstx = l2 - l0            (horizontal difference)
+//   dsty = l0 + 2*l1 + l2     (horizontal smoothing)
+// The tail code reads from src + (width - 9); isCanny3x3Supported() in this
+// file requires width >= 9.
+struct RowFilter3x3Canny
+{
+    // borderxl / borderxr are non-zero when the pixel just outside the left /
+    // right end of the row must not be read. In that case the lookup masks
+    // duplicate the edge pixel instead, and the corresponding look offset is 0.
+    inline RowFilter3x3Canny(const ptrdiff_t borderxl, const ptrdiff_t borderxr)
+    {
+        // vfmask: lanes 0..5 use index 0xFF (vtbl1_u8 yields 0 for those), lanes 6..7
+        // pick either {byte0, byte0} (replicated edge) or {byte0, byte1}.
+        vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0000FFffFFffFFffULL : 0x0100FFffFFffFFffULL));
+        // vtmask: either shift lanes left by one and repeat the last byte
+        // (replicated right edge) or the identity permutation.
+        vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0x0707060504030201ULL : 0x0706050403020100ULL));
+        lookLeft = offsetk - borderxl;
+        lookRight = offsetk - borderxr;
+    }
+
+    // src:   first pixel of the row
+    // dstx:  receives `width` horizontal differences
+    // dsty:  receives `width` horizontally smoothed values
+    // width: number of pixels in the row
+    inline void operator()(const u8* src, s16* dstx, s16* dsty, ptrdiff_t width)
+    {
+        // After the lookup, lanes 6 and 7 of `l` hold the left neighbour of pixel 0
+        // and pixel 0 itself; the other lanes are zero and never used.
+        uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask);
+        ptrdiff_t i = 0;
+        for (; i < width - 8 + lookRight; i += 8)
+        {
+            internal::prefetch(src + i);
+            // right neighbours of pixels i..i+7
+            uint8x8_t l18u = vld1_u8(src + i + 1);
+
+            uint8x8_t l2 = l18u;
+            // left neighbours: last two lanes of the previous block + first six of this one
+            uint8x8_t l0 = vext_u8(l, l18u, 6);
+            // centre pixels, widened to 16 bits and doubled
+            int16x8_t l1x2 = vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l18u, 7), 1));
+
+            // carry this block over so the next iteration can take its last two lanes
+            l = l18u;
+
+            int16x8_t l02 = vreinterpretq_s16_u16(vaddl_u8(l2, l0));
+            int16x8_t ldx = vreinterpretq_s16_u16(vsubl_u8(l2, l0));
+            int16x8_t ldy = vaddq_s16(l02, l1x2);
+
+            vst1q_s16(dstx + i, ldx);
+            vst1q_s16(dsty + i, ldy);
+        }
+
+        //tail
+        // Recomputes the last 8 pixels of the row (possibly overlapping what the
+        // loop already wrote) with the right-border handling applied.
+        if (lookRight == 0 || i != width)
+        {
+            uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1
+            // right neighbours, with the last pixel repeated when the border is replicated
+            uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask);
+            // centre pixels: byte 1 of tail0 moved to lane 7, then combined with tail2
+            uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7);
+
+            int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail2, tail0));
+            int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1));
+            int16x8_t taildx = vreinterpretq_s16_u16(vsubl_u8(tail2, tail0));
+            int16x8_t taildy = vqaddq_s16(tail02, tail1x2);
+
+            vst1q_s16(dstx + (width - 8), taildx);
+            vst1q_s16(dsty + (width - 8), taildy);
+        }
+    }
+
+    uint8x8_t vfmask;       // lane selection for the first (left-edge) load
+    uint8x8_t vtmask;       // lane selection for the tail (right-edge) load
+    enum { offsetk = 1};    // half-width of the 3-tap kernel
+    ptrdiff_t lookLeft;     // how many pixels before src may be read (0 or 1)
+    ptrdiff_t lookRight;    // how many pixels past the row end may be read (0 or 1)
+};
+
+// Vertical pass of the 3x3 gradient filter, primary template (L1 magnitude).
+// src0/src1/src2 point at the row-filter output of three consecutive rows; each
+// holds `width` x-values immediately followed by `width` y-values (the
+// y-values are read at offset + width). For every column it writes
+//   dstx = src0 + 2*src1 + src2     (from the x-values)
+//   dsty = src2 - src0              (from the y-values)
+//   mag  = |dstx| + |dsty|          (summed in 16 bits, then widened to s32)
+// NOTE(review): the remainder is handled by re-running the last 8 columns, so
+// width is presumably required to be >= 8 -- confirm against callers.
+template <bool L2gradient>
+inline void ColFilter3x3Canny(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width)
+{
+    ptrdiff_t j = 0;
+    for (; j <= width - 8; j += 8)
+    {
+        // Re-entry point for the tail: the code below the loop jumps here with
+        // j = width - 8 to process the final (overlapping) block once.
+        ColFilter3x3CannyL1Loop:
+        int16x8_t line0x = vld1q_s16(src0 + j);
+        int16x8_t line1x = vld1q_s16(src1 + j);
+        int16x8_t line2x = vld1q_s16(src2 + j);
+        int16x8_t line0y = vld1q_s16(src0 + j + width);
+        int16x8_t line2y = vld1q_s16(src2 + j + width);
+
+        int16x8_t l02 = vaddq_s16(line0x, line2x);
+        int16x8_t l1x2 = vshlq_n_s16(line1x, 1);
+        int16x8_t dy = vsubq_s16(line2y, line0y);
+        int16x8_t dx = vaddq_s16(l1x2, l02);
+
+        int16x8_t dya = vabsq_s16(dy);
+        int16x8_t dxa = vabsq_s16(dx);
+        int16x8_t norm = vaddq_s16(dya, dxa);
+
+        // widen the 16-bit magnitudes to 32 bits
+        int32x4_t normh = vmovl_s16(vget_high_s16(norm));
+        int32x4_t norml = vmovl_s16(vget_low_s16(norm));
+
+        vst1q_s16(dsty + j, dy);
+        vst1q_s16(dstx + j, dx);
+        vst1q_s32(mag + j + 4, normh);
+        vst1q_s32(mag + j, norml);
+    }
+    // After the jump the loop increment brings j to exactly width, so this
+    // branch is taken at most once.
+    if (j != width)
+    {
+        j = width - 8;
+        goto ColFilter3x3CannyL1Loop;
+    }
+}
+// Vertical pass of the 3x3 gradient filter, L2gradient = true specialization.
+// Same inputs, dstx and dsty as the primary template, but the magnitude is the
+// squared norm: mag = dstx*dstx + dsty*dsty, accumulated in 32 bits.
+// NOTE(review): the remainder is handled by re-running the last 8 columns, so
+// width is presumably required to be >= 8 -- confirm against callers.
+template <>
+inline void ColFilter3x3Canny<true>(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width)
+{
+    ptrdiff_t j = 0;
+    for (; j <= width - 8; j += 8)
+    {
+        // Re-entry point for the tail: the code below the loop jumps here with
+        // j = width - 8 to process the final (overlapping) block once.
+        ColFilter3x3CannyL2Loop:
+        int16x8_t line0x = vld1q_s16(src0 + j);
+        int16x8_t line1x = vld1q_s16(src1 + j);
+        int16x8_t line2x = vld1q_s16(src2 + j);
+        int16x8_t line0y = vld1q_s16(src0 + j + width);
+        int16x8_t line2y = vld1q_s16(src2 + j + width);
+
+        int16x8_t l02 = vaddq_s16(line0x, line2x);
+        int16x8_t l1x2 = vshlq_n_s16(line1x, 1);
+        int16x8_t dy = vsubq_s16(line2y, line0y);
+        int16x8_t dx = vaddq_s16(l1x2, l02);
+
+        // The low half starts with dx^2 and the high half with dy^2; the
+        // multiply-accumulates below add the other square to each half.
+        int32x4_t norml = vmull_s16(vget_low_s16(dx), vget_low_s16(dx));
+        int32x4_t normh = vmull_s16(vget_high_s16(dy), vget_high_s16(dy));
+
+        norml = vmlal_s16(norml, vget_low_s16(dy), vget_low_s16(dy));
+        normh = vmlal_s16(normh, vget_high_s16(dx), vget_high_s16(dx));
+
+        vst1q_s16(dsty + j, dy);
+        vst1q_s16(dstx + j, dx);
+        vst1q_s32(mag + j, norml);
+        vst1q_s32(mag + j + 4, normh);
+    }
+    // After the jump the loop increment brings j to exactly width, so this
+    // branch is taken at most once.
+    if (j != width)
+    {
+        j = width - 8;
+        goto ColFilter3x3CannyL2Loop;
+    }
+}
+
+// Gradient magnitude from precomputed derivatives, primary template (L1 norm):
+// _norm[j] = |_dx[j]| + |_dy[j]| for j in [0, colscn).
+// The vector path adds the absolute values in 16 bits before widening to s32;
+// the scalar path works in 32 bits throughout.
+template <bool L2gradient>
+inline void NormCanny(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm)
+{
+    ptrdiff_t j = 0;
+    if (colscn >= 8)
+    {
+        // Software-pipelined loop: the block for iteration j is already in
+        // vx/vy while the block for j + 8 is being loaded.
+        int16x8_t vx = vld1q_s16(_dx);
+        int16x8_t vy = vld1q_s16(_dy);
+        for (; j <= colscn - 16; j+=8)
+        {
+            internal::prefetch(_dx);
+            internal::prefetch(_dy);
+
+            int16x8_t vx2 = vld1q_s16(_dx + j + 8);
+            int16x8_t vy2 = vld1q_s16(_dy + j + 8);
+
+            int16x8_t vabsx = vabsq_s16(vx);
+            int16x8_t vabsy = vabsq_s16(vy);
+
+            int16x8_t norm = vaddq_s16(vabsx, vabsy);
+
+            int32x4_t normh = vmovl_s16(vget_high_s16(norm));
+            int32x4_t norml = vmovl_s16(vget_low_s16(norm));
+
+            vst1q_s32(_norm + j + 4, normh);
+            vst1q_s32(_norm + j + 0, norml);
+
+            vx = vx2;
+            vy = vy2;
+        }
+        // Drain the pipeline: process the last block that was loaded.
+        int16x8_t vabsx = vabsq_s16(vx);
+        int16x8_t vabsy = vabsq_s16(vy);
+
+        int16x8_t norm = vaddq_s16(vabsx, vabsy);
+
+        int32x4_t normh = vmovl_s16(vget_high_s16(norm));
+        int32x4_t norml = vmovl_s16(vget_low_s16(norm));
+
+        vst1q_s32(_norm + j + 4, normh);
+        vst1q_s32(_norm + j + 0, norml);
+    }
+    // NOTE(review): j is not advanced after the drained block, so this scalar
+    // loop recomputes those 8 elements (in 32-bit arithmetic) as well as any
+    // remainder.
+    for (; j < colscn; j++)
+        _norm[j] = std::abs(s32(_dx[j])) + std::abs(s32(_dy[j]));
+}
+
+// Gradient magnitude from precomputed derivatives, L2gradient = true
+// specialization (squared L2 norm):
+// _norm[j] = _dx[j]*_dx[j] + _dy[j]*_dy[j] for j in [0, colscn), in 32 bits.
+template <>
+inline void NormCanny<true>(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm)
+{
+    ptrdiff_t j = 0;
+    if (colscn >= 8)
+    {
+        // Software-pipelined loop: the block for iteration j is already in
+        // vx/vy while the block for j + 8 is being loaded.
+        int16x8_t vx = vld1q_s16(_dx);
+        int16x8_t vy = vld1q_s16(_dy);
+
+        for (; j <= colscn - 16; j+=8)
+        {
+            internal::prefetch(_dx);
+            internal::prefetch(_dy);
+
+            int16x8_t vxnext = vld1q_s16(_dx + j + 8);
+            int16x8_t vynext = vld1q_s16(_dy + j + 8);
+
+            // low half starts with x^2, high half with y^2; the accumulates
+            // below add the other square to each half
+            int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx));
+            int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy));
+
+            norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy));
+            normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx));
+
+            vst1q_s32(_norm + j + 0, norml);
+            vst1q_s32(_norm + j + 4, normh);
+
+            vx = vxnext;
+            vy = vynext;
+        }
+        // Drain the pipeline: process the last block that was loaded.
+        int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx));
+        int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy));
+
+        norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy));
+        normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx));
+
+        vst1q_s32(_norm + j + 0, norml);
+        vst1q_s32(_norm + j + 4, normh);
+    }
+    // NOTE(review): j is not advanced after the drained block, so this scalar
+    // loop recomputes those 8 elements as well as any remainder.
+    for (; j < colscn; j++)
+        _norm[j] = s32(_dx[j])*_dx[j] + s32(_dy[j])*_dy[j];
+}
+
+// Converts the two floating-point hysteresis thresholds into integers for the
+// L1-norm path. The thresholds are put in ascending order first; each is then
+// converted to s32 and decremented by one if the converted value ended up
+// above the original.
+template <bool L2gradient>
+inline void prepareThresh(f64 low_thresh, f64 high_thresh,
+                          s32 &low, s32 &high)
+{
+    if (high_thresh < low_thresh)
+        std::swap(low_thresh, high_thresh);
+#ifdef __GNUC__
+    low = static_cast<s32>(low_thresh);
+    high = static_cast<s32>(high_thresh);
+    if (low > low_thresh)
+        --low;
+    if (high > high_thresh)
+        --high;
+#else
+    low = internal::round(low_thresh);
+    high = internal::round(high_thresh);
+    // the differences are evaluated in single precision, as a pair, before
+    // either output is adjusted
+    const bool lowAbove = (f32)(low_thresh - low) < 0;
+    const bool highAbove = (f32)(high_thresh - high) < 0;
+    if (lowAbove)
+        --low;
+    if (highAbove)
+        --high;
+#endif
+}
+// L2gradient = true specialization: same as the primary template, except that
+// each positive threshold is squared after ordering (the magnitudes it is
+// compared against are squared norms) and before the integer conversion.
+template <>
+inline void prepareThresh<true>(f64 low_thresh, f64 high_thresh,
+                                s32 &low, s32 &high)
+{
+    if (high_thresh < low_thresh)
+        std::swap(low_thresh, high_thresh);
+    if (low_thresh > 0)
+        low_thresh = low_thresh * low_thresh;
+    if (high_thresh > 0)
+        high_thresh = high_thresh * high_thresh;
+#ifdef __GNUC__
+    low = static_cast<s32>(low_thresh);
+    high = static_cast<s32>(high_thresh);
+    if (low > low_thresh)
+        --low;
+    if (high > high_thresh)
+        --high;
+#else
+    low = internal::round(low_thresh);
+    high = internal::round(high_thresh);
+    // the differences are evaluated in single precision, as a pair, before
+    // either output is adjusted
+    const bool lowAbove = (f32)(low_thresh - low) < 0;
+    const bool highAbove = (f32)(high_thresh - high) < 0;
+    if (lowAbove)
+        --low;
+    if (highAbove)
+        --high;
+#endif
+}
+
+// Row-by-row gradient/magnitude provider for Canny3x3 that computes the 3x3
+// derivatives itself from the 8-bit source image (externalSobel = false).
+//
+// It owns one byte buffer holding three rolling "magnitude rows" followed by
+// the edge map. Each magnitude row is laid out as
+//   [mapstep s32 magnitudes][dx: width s16][dy: width s16]
+//   [row-filter x: width s16][row-filter y: width s16]
+// (the offsets below are expressed in s16 units and rely on the
+// sizeof(s32)/sizeof(s16) ratio).
+template <bool L2gradient, bool externalSobel>
+struct _normEstimator
+{
+    ptrdiff_t magstep;      // length of one magnitude row, in s32 units
+    ptrdiff_t dxOffset;     // s16 offset of the dx values inside a magnitude row
+    ptrdiff_t dyOffset;     // s16 offset of the dy values
+    ptrdiff_t shxOffset;    // s16 offset of the row-filter x output
+    ptrdiff_t shyOffset;    // s16 offset of the row-filter y output
+    std::vector<u8> buffer; // backing storage for the magnitude rows and the map
+    const ptrdiff_t offsetk; // kernel half-width (1 for 3x3)
+    ptrdiff_t borderyt, borderyb; // 1 when the row above / below the image must not be read
+    RowFilter3x3Canny sobelRow;
+
+    // Allocates the buffer, hands out the three magnitude-row pointers and the
+    // map pointer through the output arguments, zeroes the magnitudes of
+    // mag_buf[0] and sets the first and last map rows to 1.
+    // The s32 argument (channel count in the other specialization) is unused.
+    inline _normEstimator(const Size2D &size, s32, Margin borderMargin,
+                          ptrdiff_t &mapstep, s32** mag_buf, u8* &map):
+                          offsetk(1),
+                          sobelRow(std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.left),
+                                   std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.right))
+    {
+        mapstep = size.width + 2;
+        magstep = size.width + 2 + size.width * (4 * sizeof(s16)/sizeof(s32));
+        dxOffset = mapstep * sizeof(s32)/sizeof(s16);
+        dyOffset = dxOffset + size.width * 1;
+        shxOffset = dxOffset + size.width * 2;
+        shyOffset = dxOffset + size.width * 3;
+        // map bytes + three magnitude rows
+        buffer.resize( (size.width+2)*(size.height+2) + magstep*3*sizeof(s32) );
+        mag_buf[0] = (s32*)&buffer[0];
+        mag_buf[1] = mag_buf[0] + magstep;
+        mag_buf[2] = mag_buf[1] + magstep;
+        memset(mag_buf[0], 0, mapstep * sizeof(s32));
+
+        // the map starts right after the third magnitude row
+        map = (u8*)(mag_buf[2] + magstep);
+        memset(map, 1, mapstep);
+        memset(map + mapstep*(size.height + 1), 1, mapstep);
+        borderyt = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.top);
+        borderyb = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.bottom);
+    }
+    // Computes dx, dy and the magnitude of image row 0 into mag_buf[1].
+    // Rows 0 and 1 are row-filtered into mag_buf[0] and mag_buf[1]; the row
+    // above the image is either read (borderyt == 0) or replaced by row 0.
+    // The dx/dy base pointers and strides are unused in this specialization.
+    inline void firstRow(const Size2D &size, s32,
+                         const u8 *srcBase, ptrdiff_t srcStride,
+                         s16*, ptrdiff_t,
+                         s16*, ptrdiff_t,
+                         s32** mag_buf)
+    {
+        //sobelH row #0
+        const u8* _src = internal::getRowPtr(srcBase, srcStride, 0);
+        sobelRow(_src, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shyOffset, size.width);
+        //sobelH row #1
+        _src = internal::getRowPtr(srcBase, srcStride, 1);
+        sobelRow(_src, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shyOffset, size.width);
+
+        // zero the padding magnitudes on both sides of the row
+        mag_buf[1][0] = mag_buf[1][size.width+1] = 0;
+        if (borderyt == 0)
+        {
+            //sobelH row #-1
+            _src = internal::getRowPtr(srcBase, srcStride, -1);
+            sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width);
+
+            ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
+                                           ((s16*)mag_buf[1]) + dxOffset,  ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width);
+        }
+        else
+        {
+            // top border replicated: row 0 is used as the row above as well
+            ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
+                                           ((s16*)mag_buf[1]) + dxOffset,  ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width);
+        }
+    }
+    // Computes dx, dy and the magnitude of image row i into mag_buf[2] (or
+    // zeroes the magnitudes when i is past the last row), and returns through
+    // _x/_y the dx/dy values stored in mag_buf[1], i.e. those of the row
+    // processed by the previous call.
+    inline void nextRow(const Size2D &size, s32,
+                        const u8 *srcBase, ptrdiff_t srcStride,
+                        s16*, ptrdiff_t,
+                        s16*, ptrdiff_t,
+                        const ptrdiff_t &mapstep, s32** mag_buf,
+                        size_t i, const s16* &_x, const s16* &_y)
+    {
+        mag_buf[2][0] = mag_buf[2][size.width+1] = 0;
+        if (i < size.height - borderyb)
+        {
+            const u8* _src = internal::getRowPtr(srcBase, srcStride, i+1);
+            //sobelH row #i+1
+            sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width);
+
+            ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[2]) + shxOffset,
+                                           ((s16*)mag_buf[2]) + dxOffset,  ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width);
+        }
+        else if (i < size.height)
+        {
+            // bottom border replicated: the current row also stands in for the row below
+            ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
+                                           ((s16*)mag_buf[2]) + dxOffset,  ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width);
+        }
+        else
+            memset(mag_buf[2], 0, mapstep*sizeof(s32));
+        _x = ((s16*)mag_buf[1]) + dxOffset;
+        _y = ((s16*)mag_buf[1]) + dyOffset;
+    }
+};
+// Row-by-row magnitude provider for Canny3x3 when the derivatives are supplied
+// by the caller (externalSobel = true). Only the magnitudes are computed here,
+// via NormCanny. For multi-channel input (cn > 1) each pixel is reduced to the
+// channel with the largest magnitude, and the caller's dx/dy rows are
+// compacted in place accordingly.
+template <bool L2gradient>
+struct _normEstimator<L2gradient, true>
+{
+    std::vector<u8> buffer; // backing storage for three magnitude rows and the map
+
+    // Allocates the buffer, hands out the three magnitude-row pointers (each
+    // mapstep*cn s32 long) and the map pointer through the output arguments,
+    // zeroes the first mapstep magnitudes of mag_buf[0] and sets the first and
+    // last map rows to 1. The Margin argument is unused.
+    inline _normEstimator(const Size2D &size, s32 cn, Margin,
+                          ptrdiff_t &mapstep, s32** mag_buf, u8* &map)
+    {
+        mapstep = size.width + 2;
+        buffer.resize( (size.width+2)*(size.height+2) + cn*mapstep*3*sizeof(s32) );
+        mag_buf[0] = (s32*)&buffer[0];
+        mag_buf[1] = mag_buf[0] + mapstep*cn;
+        mag_buf[2] = mag_buf[1] + mapstep*cn;
+        memset(mag_buf[0], 0, /* cn* */mapstep * sizeof(s32));
+
+        // the map starts right after the third magnitude row
+        map = (u8*)(mag_buf[2] + mapstep*cn);
+        memset(map, 1, mapstep);
+        memset(map + mapstep*(size.height + 1), 1, mapstep);
+    }
+    // Computes the magnitudes of row 0 into mag_buf[1] (shifted by one element
+    // to leave room for the left padding). The source image pointer and stride
+    // are unused.
+    inline void firstRow(const Size2D &size, s32 cn,
+                         const u8 *, ptrdiff_t,
+                         s16* dxBase, ptrdiff_t dxStride,
+                         s16* dyBase, ptrdiff_t dyStride,
+                         s32** mag_buf)
+    {
+        s32* _norm = mag_buf[1] + 1;
+
+        s16* _dx = internal::getRowPtr(dxBase, dxStride, 0);
+        s16* _dy = internal::getRowPtr(dyBase, dyStride, 0);
+
+        NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm);
+
+        if(cn > 1)
+        {
+            // keep, per pixel, the channel with the largest magnitude; results
+            // are written to index j, which never runs ahead of the read index jn
+            for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn)
+            {
+                size_t maxIdx = jn;
+                for(s32 k = 1; k < cn; ++k)
+                    if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
+                _norm[j] = _norm[maxIdx];
+                _dx[j] = _dx[maxIdx];
+                _dy[j] = _dy[maxIdx];
+            }
+        }
+
+        // zero the padding magnitudes on both sides of the row
+        _norm[-1] = _norm[size.width] = 0;
+    }
+    // Computes the magnitudes of row i (or zeroes them when i is past the last
+    // row) and returns through _x/_y the caller's dx/dy rows for row i-1.
+    inline void nextRow(const Size2D &size, s32 cn,
+                        const u8 *, ptrdiff_t,
+                        s16* dxBase, ptrdiff_t dxStride,
+                        s16* dyBase, ptrdiff_t dyStride,
+                        const ptrdiff_t &mapstep, s32** mag_buf,
+                        size_t i, const s16* &_x, const s16* &_y)
+    {
+        // mag_buf[2] for any i > 0, mag_buf[1] for i == 0
+        s32* _norm = mag_buf[(i > 0) + 1] + 1;
+        if (i < size.height)
+        {
+            s16* _dx = internal::getRowPtr(dxBase, dxStride, i);
+            s16* _dy = internal::getRowPtr(dyBase, dyStride, i);
+
+            NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm);
+
+            if(cn > 1)
+            {
+                // keep, per pixel, the channel with the largest magnitude
+                for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn)
+                {
+                    size_t maxIdx = jn;
+                    for(s32 k = 1; k < cn; ++k)
+                        if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
+                    _norm[j] = _norm[maxIdx];
+                    _dx[j] = _dx[maxIdx];
+                    _dy[j] = _dy[maxIdx];
+                }
+            }
+
+            _norm[-1] = _norm[size.width] = 0;
+        }
+        else
+            memset(_norm-1, 0, /* cn* */mapstep*sizeof(s32));
+
+        _x = internal::getRowPtr(dxBase, dxStride, i-1);
+        _y = internal::getRowPtr(dyBase, dyStride, i-1);
+    }
+};
+
+// Canny edge detector core shared by all public Canny3x3* entry points.
+//
+// Template parameters:
+//   L2gradient    - true: squared L2 gradient magnitude; false: L1 magnitude.
+//   externalSobel - true: derivatives come from dxBase/dyBase; false: they are
+//                   computed from srcBase with the built-in 3x3 filter.
+// Parameters:
+//   size                 - image size.
+//   cn                   - channel count (forwarded to the norm estimator).
+//   srcBase/srcStride    - 8-bit source image (used when !externalSobel).
+//   dstBase/dstStride    - output edge image; edge pixels become 0xFF, others 0.
+//   dxBase/dyBase (+stride) - caller-provided derivatives (used when externalSobel).
+//   low_thresh/high_thresh  - hysteresis thresholds (order does not matter).
+//   borderMargin         - readable margin around the source image.
+//
+// Steps: (1) per row, compute magnitudes and perform non-maxima suppression,
+// seeding a stack with pixels above the high threshold; (2) grow edges from the
+// seeds through candidate pixels (hysteresis); (3) convert the map to the
+// output image.
+template <bool L2gradient, bool externalSobel>
+inline void Canny3x3(const Size2D &size, s32 cn,
+                     const u8 * srcBase, ptrdiff_t srcStride,
+                     u8 * dstBase, ptrdiff_t dstStride,
+                     s16 * dxBase, ptrdiff_t dxStride,
+                     s16 * dyBase, ptrdiff_t dyStride,
+                     f64 low_thresh, f64 high_thresh,
+                     Margin borderMargin)
+{
+    s32 low, high;
+    prepareThresh<L2gradient>(low_thresh, high_thresh, low, high);
+
+    ptrdiff_t mapstep;
+    s32* mag_buf[3];
+    u8* map;
+    _normEstimator<L2gradient, externalSobel> normEstimator(size, cn, borderMargin, mapstep, mag_buf, map);
+
+    size_t maxsize = std::max<size_t>( 1u << 10, size.width * size.height / 10 );
+    std::vector<u8*> stack( maxsize );
+    u8 **stack_top = &stack[0];
+    u8 **stack_bottom = &stack[0];
+
+    /* sector numbers
+       (Top-Left Origin)
+
+        1   2   3
+         *  *  *
+          * * *
+        0*******0
+          * * *
+         *  *  *
+        3   2   1
+    */
+
+    // CANNY_PUSH marks a map cell as an edge (2) and records it on the stack;
+    // CANNY_POP retrieves the most recently recorded cell.
+    #define CANNY_PUSH(d)    *(d) = u8(2), *stack_top++ = (d)
+    #define CANNY_POP(d)     (d) = *--stack_top
+
+    //i == 0
+    normEstimator.firstRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mag_buf);
+    // calculate magnitude and angle of gradient, perform non-maxima suppression.
+    // fill the map with one of the following values:
+    //   0 - the pixel might belong to an edge
+    //   1 - the pixel can not belong to an edge
+    //   2 - the pixel does belong to an edge
+    for (size_t i = 1; i <= size.height; i++)
+    {
+        const s16 *_x, *_y;
+        normEstimator.nextRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mapstep, mag_buf, i, _x, _y);
+
+        u8* _map = map + mapstep*i + 1;
+        _map[-1] = _map[size.width] = 1;
+
+        s32* _mag = mag_buf[1] + 1; // take the central row
+        ptrdiff_t magstep1 = mag_buf[2] - mag_buf[1];
+        ptrdiff_t magstep2 = mag_buf[0] - mag_buf[1];
+
+        // Make sure the stack can take every push this row may produce.
+        // Growing by 3/2 alone is not enough when size.width is large compared
+        // with maxsize (wide, short images), so also require room for a full
+        // row on top of the current contents.
+        if ((stack_top - stack_bottom) + size.width > maxsize)
+        {
+            ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom);
+            maxsize = std::max<size_t>(maxsize * 3/2, (size_t)sz + size.width);
+            stack.resize(maxsize);
+            // resize may reallocate: rebuild both pointers from the new storage
+            stack_bottom = &stack[0];
+            stack_top = stack_bottom + sz;
+        }
+
+        s32 prev_flag = 0;
+        for (ptrdiff_t j = 0; j < (ptrdiff_t)size.width; j++)
+        {
+            #define CANNY_SHIFT 15
+            // tan(22.5 degrees) in fixed point
+            const s32 TG22 = (s32)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
+
+            s32 m = _mag[j];
+
+            if (m > low)
+            {
+                s32 xs = _x[j];
+                s32 ys = _y[j];
+                s32 x = abs(xs);
+                s32 y = abs(ys) << CANNY_SHIFT;
+
+                s32 tg22x = x * TG22;
+
+                if (y < tg22x)
+                {
+                    // sector 0: compare with the left and right neighbours
+                    if (m > _mag[j-1] && m >= _mag[j+1]) goto __push;
+                }
+                else
+                {
+                    s32 tg67x = tg22x + (x << (CANNY_SHIFT+1));
+                    if (y > tg67x)
+                    {
+                        // sector 2: compare with the neighbours above and below
+                        if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) goto __push;
+                    }
+                    else
+                    {
+                        // sectors 1 and 3: compare along the diagonal selected by
+                        // the relative sign of the two derivatives
+                        s32 s = (xs ^ ys) < 0 ? -1 : 1;
+                        if(m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) goto __push;
+                    }
+                }
+            }
+            prev_flag = 0;
+            _map[j] = u8(1);
+            continue;
+            __push:
+            // A local maximum: seed the stack when it exceeds the high threshold
+            // and is not adjacent to an already seeded pixel (previous pixel in
+            // this row or the pixel directly above); otherwise keep it as a candidate.
+            if (!prev_flag && m > high && _map[j-mapstep] != 2)
+            {
+                CANNY_PUSH(_map + j);
+                prev_flag = 1;
+            }
+            else
+                _map[j] = 0;
+        }
+
+        // scroll the ring buffer
+        _mag = mag_buf[0];
+        mag_buf[0] = mag_buf[1];
+        mag_buf[1] = mag_buf[2];
+        mag_buf[2] = _mag;
+    }
+
+    // now track the edges (hysteresis thresholding)
+    while (stack_top > stack_bottom)
+    {
+        u8* m;
+        // each iteration pushes at most 8 neighbours; maxsize is at least 1024,
+        // so growing by 3/2 always provides that much room
+        if ((size_t)(stack_top - stack_bottom) + 8u > maxsize)
+        {
+            ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom);
+            maxsize = maxsize * 3/2;
+            stack.resize(maxsize);
+            stack_bottom = &stack[0];
+            stack_top = stack_bottom + sz;
+        }
+
+        CANNY_POP(m);
+
+        // promote every candidate (0) among the 8 neighbours to an edge
+        if (!m[-1])         CANNY_PUSH(m - 1);
+        if (!m[1])          CANNY_PUSH(m + 1);
+        if (!m[-mapstep-1]) CANNY_PUSH(m - mapstep - 1);
+        if (!m[-mapstep])   CANNY_PUSH(m - mapstep);
+        if (!m[-mapstep+1]) CANNY_PUSH(m - mapstep + 1);
+        if (!m[mapstep-1])  CANNY_PUSH(m + mapstep - 1);
+        if (!m[mapstep])    CANNY_PUSH(m + mapstep);
+        if (!m[mapstep+1])  CANNY_PUSH(m + mapstep + 1);
+    }
+
+    // the final pass, form the final image
+    uint8x16_t v2 = vmovq_n_u8(2);
+    const u8* ptrmap = map + mapstep + 1;
+    for (size_t i = 0; i < size.height; i++, ptrmap += mapstep)
+    {
+        u8* _dst = internal::getRowPtr(dstBase, dstStride, i);
+        ptrdiff_t j = 0;
+        for (; j < (ptrdiff_t)size.width - 16; j += 16)
+        {
+            internal::prefetch(ptrmap);
+            uint8x16_t vmap = vld1q_u8(ptrmap + j);
+            // map value 2 -> 0xFF, anything else -> 0
+            uint8x16_t vdst = vceqq_u8(vmap, v2);
+            vst1q_u8(_dst+j, vdst);
+        }
+        // scalar equivalent: (map >> 1) is 1 only for the value 2; negation gives 0xFF
+        for (; j < (ptrdiff_t)size.width; j++)
+            _dst[j] = (u8)-(ptrmap[j] >> 1);
+    }
+}
+
+} // namespace
+#endif
+
+// Reports whether the 3x3 Canny implementation can process an image of the
+// given size: the configuration must be supported and the image must be at
+// least 9 pixels wide and 2 pixels high.
+bool isCanny3x3Supported(const Size2D &size)
+{
+    if (!isSupportedConfiguration())
+        return false;
+    if (size.height < 2)
+        return false;
+    return size.width >= 9;
+}
+
+// Canny edge detection on an 8-bit image with the built-in 3x3 gradient and
+// the L1 magnitude: forwards to Canny3x3<false, false> with one channel and no
+// external derivative buffers. Asserts that isCanny3x3Supported(size) holds.
+void Canny3x3L1(const Size2D &size,
+                const u8 * srcBase, ptrdiff_t srcStride,
+                u8 * dstBase, ptrdiff_t dstStride,
+                f64 low_thresh, f64 high_thresh,
+                Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isCanny3x3Supported(size));
+#ifndef CAROTENE_NEON
+    // Nothing is computed in this configuration; mark the arguments as used.
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)low_thresh;
+    (void)high_thresh;
+    (void)borderMargin;
+#else
+    Canny3x3<false, false>(size, 1, srcBase, srcStride, dstBase, dstStride,
+                           NULL, 0, NULL, 0,
+                           low_thresh, high_thresh, borderMargin);
+#endif
+}
+
+// Canny edge detection on an 8-bit image with the built-in 3x3 gradient and
+// the squared L2 magnitude: forwards to Canny3x3<true, false> with one channel
+// and no external derivative buffers. Asserts that isCanny3x3Supported(size)
+// holds.
+void Canny3x3L2(const Size2D &size,
+                const u8 * srcBase, ptrdiff_t srcStride,
+                u8 * dstBase, ptrdiff_t dstStride,
+                f64 low_thresh, f64 high_thresh,
+                Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isCanny3x3Supported(size));
+#ifndef CAROTENE_NEON
+    // Nothing is computed in this configuration; mark the arguments as used.
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)low_thresh;
+    (void)high_thresh;
+    (void)borderMargin;
+#else
+    Canny3x3<true, false>(size, 1, srcBase, srcStride, dstBase, dstStride,
+                          NULL, 0, NULL, 0,
+                          low_thresh, high_thresh, borderMargin);
+#endif
+}
+
+// Canny edge detection from caller-supplied derivatives with the L1
+// magnitude: forwards to Canny3x3<false, true> with no source image and a
+// default-constructed Margin.
+void Canny3x3L1(const Size2D &size, s32 cn,
+                     s16 * dxBase, ptrdiff_t dxStride,
+                     s16 * dyBase, ptrdiff_t dyStride,
+                     u8 * dstBase, ptrdiff_t dstStride,
+                     f64 low_thresh, f64 high_thresh)
+{
+    internal::assertSupportedConfiguration();
+#ifndef CAROTENE_NEON
+    // Nothing is computed in this configuration; mark the arguments as used.
+    (void)size;
+    (void)cn;
+    (void)dstBase;
+    (void)dstStride;
+    (void)dxBase;
+    (void)dxStride;
+    (void)dyBase;
+    (void)dyStride;
+    (void)low_thresh;
+    (void)high_thresh;
+#else
+    Canny3x3<false, true>(size, cn, NULL, 0, dstBase, dstStride,
+                          dxBase, dxStride, dyBase, dyStride,
+                          low_thresh, high_thresh, Margin());
+#endif
+}
+
+// Canny edge detection from caller-supplied derivatives with the squared L2
+// magnitude: forwards to Canny3x3<true, true> with no source image and a
+// default-constructed Margin.
+void Canny3x3L2(const Size2D &size, s32 cn,
+                     s16 * dxBase, ptrdiff_t dxStride,
+                     s16 * dyBase, ptrdiff_t dyStride,
+                     u8 * dstBase, ptrdiff_t dstStride,
+                     f64 low_thresh, f64 high_thresh)
+{
+    internal::assertSupportedConfiguration();
+#ifndef CAROTENE_NEON
+    // Nothing is computed in this configuration; mark the arguments as used.
+    (void)size;
+    (void)cn;
+    (void)dstBase;
+    (void)dstStride;
+    (void)dxBase;
+    (void)dxStride;
+    (void)dyBase;
+    (void)dyStride;
+    (void)low_thresh;
+    (void)high_thresh;
+#else
+    Canny3x3<true, true>(size, cn, NULL, 0, dstBase, dstStride,
+                         dxBase, dxStride, dyBase, dyStride,
+                         low_thresh, high_thresh, Margin());
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/channel_extract.cpp
+++ b/3rdparty/carotene/src/channel_extract.cpp
@ -0,0 +1,486 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+/*
+ * extract2 - copy one channel out of a 2-channel interleaved u8 image.
+ *
+ *   size                 destination width (elements per row) and height (rows)
+ *   srcBase / srcStride  interleaved source and its row stride
+ *   dstBase / dstStride  single-channel destination and its row stride
+ *   coi                  channel to keep; it is used unchecked both as an index into the
+ *                        de-interleaved vector pair and as an element offset, so callers
+ *                        must pass 0 or 1
+ *
+ * Without CAROTENE_NEON only the configuration assertion runs; every argument is ignored.
+ */
+void extract2(const Size2D &size,
+              const u8 * srcBase, ptrdiff_t srcStride,
+              u8 * dstBase, ptrdiff_t dstStride,
+              u32 coi)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // A vector step starting at column x is taken only while x is below these bounds,
+    // which guarantees the whole step stays inside the row.
+#ifndef ANDROID
+    const size_t end32 = size.width >= 31 ? size.width - 31 : 0;
+#endif
+    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t row = 0u; row < size.height; ++row)
+    {
+        const u8 * srcRow = internal::getRowPtr(srcBase, srcStride, row);
+        u8 * dstRow = internal::getRowPtr(dstBase, dstStride, row);
+        size_t x = 0u; // destination column; the matching source offset is 2 * x
+
+#ifndef ANDROID
+        // 32 columns per iteration, as two 16-column de-interleaving loads
+        while (x < end32)
+        {
+            const u8 * in = srcRow + 2 * x;
+            internal::prefetch(in);
+
+            uint8x16x2_t pix = vld2q_u8(in);
+            vst1q_u8(dstRow + x, pix.val[coi]);
+
+            pix = vld2q_u8(in + 32);
+            vst1q_u8(dstRow + x + 16, pix.val[coi]);
+
+            x += 32;
+        }
+#endif
+
+        // 8 columns per iteration
+        while (x < end8)
+        {
+            uint8x8x2_t pix = vld2_u8(srcRow + 2 * x);
+            vst1_u8(dstRow + x, pix.val[coi]);
+            x += 8;
+        }
+
+        // scalar tail
+        while (x < size.width)
+        {
+            dstRow[x] = srcRow[2 * x + coi];
+            ++x;
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase; (void)srcStride;
+    (void)dstBase; (void)dstStride;
+    (void)coi;
+#endif
+}
+
+/*
+ * extract3 - copy one channel out of a 3-channel interleaved u8 image.
+ *
+ *   size                 destination width (elements per row) and height (rows)
+ *   srcBase / srcStride  interleaved source and its row stride
+ *   dstBase / dstStride  single-channel destination and its row stride
+ *   coi                  channel to keep; it is used unchecked both as an index into the
+ *                        de-interleaved vector triple and as an element offset, so callers
+ *                        must pass 0, 1 or 2
+ *
+ * Without CAROTENE_NEON only the configuration assertion runs; every argument is ignored.
+ */
+void extract3(const Size2D &size,
+              const u8 * srcBase, ptrdiff_t srcStride,
+              u8 * dstBase, ptrdiff_t dstStride,
+              u32 coi)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // A vector step starting at column x is taken only while x is below these bounds,
+    // which guarantees the whole step stays inside the row.
+#ifndef ANDROID
+    const size_t end32 = size.width >= 31 ? size.width - 31 : 0;
+#endif
+    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t row = 0u; row < size.height; ++row)
+    {
+        const u8 * srcRow = internal::getRowPtr(srcBase, srcStride, row);
+        u8 * dstRow = internal::getRowPtr(dstBase, dstStride, row);
+        size_t x = 0u; // destination column; the matching source offset is 3 * x
+
+#ifndef ANDROID
+        // 32 columns per iteration, as two 16-column de-interleaving loads
+        while (x < end32)
+        {
+            const u8 * in = srcRow + 3 * x;
+            internal::prefetch(in);
+
+            uint8x16x3_t pix = vld3q_u8(in);
+            vst1q_u8(dstRow + x, pix.val[coi]);
+
+            pix = vld3q_u8(in + 48);
+            vst1q_u8(dstRow + x + 16, pix.val[coi]);
+
+            x += 32;
+        }
+#endif
+
+        // 8 columns per iteration
+        while (x < end8)
+        {
+            uint8x8x3_t pix = vld3_u8(srcRow + 3 * x);
+            vst1_u8(dstRow + x, pix.val[coi]);
+            x += 8;
+        }
+
+        // scalar tail
+        while (x < size.width)
+        {
+            dstRow[x] = srcRow[3 * x + coi];
+            ++x;
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase; (void)srcStride;
+    (void)dstBase; (void)dstStride;
+    (void)coi;
+#endif
+}
+
+/*
+ * extract4 - copy one channel out of a 4-channel interleaved u8 image.
+ *
+ *   size                 destination width (elements per row) and height (rows)
+ *   srcBase / srcStride  interleaved source and its row stride
+ *   dstBase / dstStride  single-channel destination and its row stride
+ *   coi                  channel to keep; it is used unchecked both as an index into the
+ *                        de-interleaved vector quadruple and as an element offset, so
+ *                        callers must pass a value in 0..3
+ *
+ * Without CAROTENE_NEON only the configuration assertion runs; every argument is ignored.
+ */
+void extract4(const Size2D &size,
+              const u8 * srcBase, ptrdiff_t srcStride,
+              u8 * dstBase, ptrdiff_t dstStride,
+              u32 coi)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // A vector step starting at column x is taken only while x is below these bounds,
+    // which guarantees the whole step stays inside the row.
+#ifndef ANDROID
+    const size_t end32 = size.width >= 31 ? size.width - 31 : 0;
+#endif
+    const size_t end8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t row = 0u; row < size.height; ++row)
+    {
+        const u8 * srcRow = internal::getRowPtr(srcBase, srcStride, row);
+        u8 * dstRow = internal::getRowPtr(dstBase, dstStride, row);
+        size_t x = 0u; // destination column; the matching source offset is 4 * x
+
+#ifndef ANDROID
+        // 32 columns per iteration, as two 16-column de-interleaving loads
+        while (x < end32)
+        {
+            const u8 * in = srcRow + 4 * x;
+            internal::prefetch(in);
+
+            uint8x16x4_t pix = vld4q_u8(in);
+            vst1q_u8(dstRow + x, pix.val[coi]);
+
+            pix = vld4q_u8(in + 64);
+            vst1q_u8(dstRow + x + 16, pix.val[coi]);
+
+            x += 32;
+        }
+#endif
+
+        // 8 columns per iteration
+        while (x < end8)
+        {
+            uint8x8x4_t pix = vld4_u8(srcRow + 4 * x);
+            vst1_u8(dstRow + x, pix.val[coi]);
+            x += 8;
+        }
+
+        // scalar tail
+        while (x < size.width)
+        {
+            dstRow[x] = srcRow[4 * x + coi];
+            ++x;
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase; (void)srcStride;
+    (void)dstBase; (void)dstStride;
+    (void)coi;
+#endif
+}
+
+#define FILL_LINES2(macro,type) /* expands macro##_LINE(type,k) once per channel, k = 0 and 1 */ \
+            macro##_LINE(type,0) \
+            macro##_LINE(type,1)
+#define FILL_LINES3(macro,type) /* the FILL_LINES2 expansion followed by k = 2 */ \
+            FILL_LINES2(macro,type) \
+            macro##_LINE(type,2)
+#define FILL_LINES4(macro,type) /* the FILL_LINES3 expansion followed by k = 3 */ \
+            FILL_LINES3(macro,type) \
+            macro##_LINE(type,3)
+
+#define FARG_LINE(type, n) , type * dst##n##Base, ptrdiff_t dst##n##Stride // (base, stride) parameter pair for plane n; starts with ',' so it can follow the preceding parameter
+
+#ifdef CAROTENE_NEON
+
+#define VROW_LINE(type, n) type * dst##n = internal::getRowPtr(dst##n##Base, dst##n##Stride, i); // declares dstN: plane n's pointer for row i (i comes from the expanding scope)
+#define VST1Q_LINE(type, n) vst1q_##type(dst##n + dj, v_src.val[n]); // 128-bit store of v_src.val[n] to plane n at column dj
+#define VST1_LINE(type, n) vst1_##type(dst##n + dj, v_src.val[n]); // 64-bit store of v_src.val[n] to plane n at column dj
+#define SST_LINE(type, n) dst##n[dj] = src[sj + n]; // scalar copy: element n of the interleaved group at sj goes to plane n
+
+// Multiply a value by 2, 3 or 4 using shifts and adds. The SPLIT macros use them to turn
+// a destination step into the matching interleaved-source step.
+// The argument is parenthesized so that operators binding looser than '<<' or '+'
+// (e.g. '|', '?:') inside it cannot regroup the expansion.
+#define MUL2(val) ((val) << 1)
+#define MUL3(val) (MUL2(val) + (val))
+#define MUL4(val) ((val) << 2)
+
+#define CONTDST2 srcStride == dst0Stride && /* condition prefix: every destination stride equals srcStride */ \
+                 srcStride == dst1Stride && // ends in '&&' on purpose: the user appends one more comparison
+#define CONTDST3 srcStride == dst0Stride && /* same prefix for three destination planes */ \
+                 srcStride == dst1Stride && \
+                 srcStride == dst2Stride && // ends in '&&' on purpose: the user appends one more comparison
+#define CONTDST4 srcStride == dst0Stride && /* same prefix for four destination planes */ \
+                 srcStride == dst1Stride && \
+                 srcStride == dst2Stride && \
+                 srcStride == dst3Stride && // NOTE(review): a source row holds n interleaved elements per destination element, so equal src/dst strides look unreachable for non-overlapping rows - confirm intent
+
+// SPLIT_QUAD(sgn, bits, n): prefetch the source, then de-interleave one 128-bit vector per
+// channel from src + sj and store channel k to dstK + dj. It relies on src, sj, dj and
+// dst0..dst(n-1) being declared in the expanding scope.
+//
+// GCC 4.x releases before 4.7 take the hand-written assembly below; every other compiler
+// uses the vldNq/vst1q intrinsics in the #else branch.
+#if __GNUC__ == 4 && __GNUC_MINOR__ < 7
+
+// SPLIT_ASMn: two 64-bit vldN loads fill the low and high halves of q registers, then one
+// vst1 per channel writes a full q register. All pointers are passed as plain register
+// inputs, so the "memory" clobber is needed to tell the compiler that the statement reads
+// and writes memory it cannot see through the operand list.
+#define SPLIT_ASM2(sgn, bits) __asm__ ( \
+                                          "vld2." #bits " {d0, d2}, [%[in0]]            \n\t" \
+                                          "vld2." #bits " {d1, d3}, [%[in1]]            \n\t" \
+                                          "vst1." #bits " {d0-d1}, [%[out0]]            \n\t" \
+                                          "vst1." #bits " {d2-d3}, [%[out1]]            \n\t" \
+                                          : \
+                                          : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), \
+                                            [in0]  "r" (src + sj), [in1]  "r" (src + sj + MUL2(8)/sizeof(sgn##bits)) \
+                                          : "d0","d1","d2","d3","memory" \
+                                      );
+#define SPLIT_ASM3(sgn, bits) __asm__ ( \
+                                          "vld3." #bits " {d0, d2, d4}, [%[in0]]        \n\t" \
+                                          "vld3." #bits " {d1, d3, d5}, [%[in1]]        \n\t" \
+                                          "vst1." #bits " {d0-d1}, [%[out0]]            \n\t" \
+                                          "vst1." #bits " {d2-d3}, [%[out1]]            \n\t" \
+                                          "vst1." #bits " {d4-d5}, [%[out2]]            \n\t" \
+                                          : \
+                                          : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \
+                                            [in0]  "r" (src + sj), [in1]  "r" (src + sj + MUL3(8)/sizeof(sgn##bits)) \
+                                          : "d0","d1","d2","d3","d4","d5","memory" \
+                                      );
+#define SPLIT_ASM4(sgn, bits) __asm__ ( \
+                                          "vld4." #bits " {d0, d2, d4, d6}, [%[in0]]    \n\t" \
+                                          "vld4." #bits " {d1, d3, d5, d7}, [%[in1]]    \n\t" \
+                                          "vst1." #bits " {d0-d1}, [%[out0]]            \n\t" \
+                                          "vst1." #bits " {d2-d3}, [%[out1]]            \n\t" \
+                                          "vst1." #bits " {d4-d5}, [%[out2]]            \n\t" \
+                                          "vst1." #bits " {d6-d7}, [%[out3]]            \n\t" \
+                                          : \
+                                          : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), [out3] "r" (dst3 + dj), \
+                                            [in0]  "r" (src + sj), [in1]  "r" (src + sj + MUL4(8)/sizeof(sgn##bits)) \
+                                          : "d0","d1","d2","d3","d4","d5","d6","d7","memory" \
+                                      );
+
+#define SPLIT_QUAD(sgn, bits, n) { \
+                                     internal::prefetch(src + sj); \
+                                     SPLIT_ASM##n(sgn, bits) \
+                                 }
+
+#else
+
+/* Intrinsic variant: needs a 'vec128' typedef in the expanding scope. */
+#define SPLIT_QUAD(sgn, bits, n) { \
+                                     internal::prefetch(src + sj); \
+                                     vec128 v_src = vld##n##q_##sgn##bits(src + sj); \
+                                     FILL_LINES##n(VST1Q, sgn##bits) \
+                                 }
+
+#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7
+
+#define SPLIT(sgn,bits,n) void split##n(const Size2D &_size, /* defines splitN: de-interleave n channels */ \
+                                    const sgn##bits * srcBase, ptrdiff_t srcStride                      \
+                                    FILL_LINES##n(FARG, sgn##bits) )                                    \
+{                                                                                                       \
+    internal::assertSupportedConfiguration();                                                           \
+    Size2D size(_size); /* local copy: the dimensions may be rewritten below */                         \
+    if (CONTDST##n                                                                                      \
+        dst0Stride == (ptrdiff_t)(size.width))                                                          \
+    {   /* all strides equal the width: process the image as one long row */                            \
+        size.width *= size.height;                                                                      \
+        size.height = 1;                                                                                \
+    }                                                                                                   \
+    typedef internal::VecTraits<sgn##bits, n>::vec128 vec128;                                           \
+    size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
+    typedef internal::VecTraits<sgn##bits, n>::vec64 vec64;                                             \
+    size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0;    \
+    /* roiw16 / roiw8: columns below these bounds leave room for a full 16- / 8-byte step */            \
+    for (size_t i = 0u; i < size.height; ++i)                                                           \
+    {                                                                                                   \
+        const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i);                             \
+        FILL_LINES##n(VROW, sgn##bits)                                                                  \
+        size_t sj = 0u, dj = 0u; /* sj: interleaved source index, dj: destination column */             \
+        /* main loop: one 128-bit vector per channel per iteration */                                   \
+        for (; dj < roiw16; sj += MUL##n(16)/sizeof(sgn##bits), dj += 16/sizeof(sgn##bits))             \
+            SPLIT_QUAD(sgn, bits, n)                                                                    \
+        /* fewer than 16 bytes per plane remain, so at most one 64-bit step fits */                     \
+        if (dj < roiw8)                                                                                 \
+        {                                                                                               \
+            vec64 v_src = vld##n##_##sgn##bits(src + sj);                                               \
+            FILL_LINES##n(VST1, sgn##bits)                                                              \
+            sj += MUL##n(8)/sizeof(sgn##bits);                                                          \
+            dj += 8/sizeof(sgn##bits);                                                                  \
+        }                                                                                               \
+        /* scalar tail */                                                                               \
+        for (; dj < size.width; sj += n, ++dj)                                                          \
+        {                                                                                               \
+            FILL_LINES##n(SST, sgn##bits)                                                               \
+        }                                                                                               \
+    }                                                                                                   \
+}
+
+#define SPLIT64(sgn,n) void split##n(const Size2D &_size, /* splitN for 64-bit elements */              \
+                                     const sgn##64 * srcBase, ptrdiff_t srcStride                       \
+                                     FILL_LINES##n(FARG, sgn##64) )                                     \
+{                                                                                                       \
+    internal::assertSupportedConfiguration();                                                           \
+    Size2D size(_size); /* local copy: the dimensions may be rewritten below */                         \
+    if (CONTDST##n                                                                                      \
+        dst0Stride == (ptrdiff_t)(size.width))                                                          \
+    {   /* all strides equal the width: process the image as one long row */                            \
+        size.width *= size.height;                                                                      \
+        size.height = 1;                                                                                \
+    }                                                                                                   \
+    typedef internal::VecTraits<sgn##64, n>::vec64 vec64;                                               \
+                                                                                                        \
+    for (size_t i = 0u; i < size.height; ++i)                                                           \
+    {                                                                                                   \
+        const sgn##64 * src = internal::getRowPtr(srcBase, srcStride, i);                               \
+        FILL_LINES##n(VROW, sgn##64)                                                                    \
+        size_t sj = 0u, dj = 0u; /* sj: interleaved source index, dj: destination column */             \
+        /* one pixel per iteration: a 64-bit vector holds a single element per channel */               \
+        for (; dj < size.width; sj += n, ++dj)                                                          \
+        {                                                                                               \
+            vec64 v_src = vld##n##_##sgn##64(src + sj);                                                 \
+            FILL_LINES##n(VST1, sgn##64)                                                                \
+        }                                                                                               \
+    }                                                                                                   \
+}
+
+// ALPHA_QUAD(sgn, bits): prefetch the source, load one 128-bit vector per channel of a
+// 4-channel interleaved source at src + sj, write the first three channels re-interleaved
+// to dst3 + d3j and the fourth channel to dst1 + d1j. It relies on src, sj, dst3, d3j,
+// dst1 and d1j being declared in the expanding scope.
+//
+// GCC 4.x releases before 4.7 take the hand-written assembly; every other compiler uses
+// the intrinsics in the #else branch.
+#if __GNUC__ == 4 && __GNUC_MINOR__ < 7
+
+// All pointers are passed as plain register inputs, so the "memory" clobber is needed to
+// tell the compiler that the statement reads and writes memory it cannot see through the
+// operand list.
+#define ALPHA_QUAD(sgn, bits) { \
+                                  internal::prefetch(src + sj); \
+                                  __asm__ ( \
+                                      "vld4." #bits " {d0, d2, d4, d6}, [%[in0]]    \n\t" \
+                                      "vld4." #bits " {d1, d3, d5, d7}, [%[in1]]    \n\t" \
+                                      "vst3." #bits " {d0, d2, d4}, [%[out3_1]]     \n\t" \
+                                      "vst3." #bits " {d1, d3, d5}, [%[out3_2]]     \n\t" \
+                                      "vst1." #bits " {d6-d7}, [%[out1]]            \n\t" \
+                                      : \
+                                      : [out3_1] "r" (dst3 + d3j), [out3_2] "r" (dst3 + d3j + 24/sizeof(sgn##bits)), [out1] "r" (dst1 + d1j), \
+                                        [in0]  "r" (src + sj), [in1]  "r" (src + sj + 32/sizeof(sgn##bits)) \
+                                      : "d0","d1","d2","d3","d4","d5","d6","d7","memory" \
+                                  ); \
+                              }
+
+#else
+
+/* Intrinsic variant: needs 'vec128_4' and 'vec128_3' typedefs in the expanding scope.
+ * The union reinterprets the first three vectors of the 4-vector load as a 3-vector
+ * value for vst3q. */
+#define ALPHA_QUAD(sgn, bits) { \
+                                  internal::prefetch(src + sj); \
+                                  union { vec128_4 v4; vec128_3 v3; } vals; \
+                                  vals.v4 = vld4q_##sgn##bits(src + sj); \
+                                  vst3q_##sgn##bits(dst3 + d3j, vals.v3); \
+                                  vst1q_##sgn##bits(dst1 + d1j, vals.v4.val[3]); \
+                              }
+
+#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7
+
+// SPLIT4ALPHA(sgn, bits) defines a split4() overload that separates a 4-channel
+// interleaved image into
+//   dst3 - the first three channels, still interleaved, and
+//   dst1 - the fourth channel as a single plane.
+// Each row is processed in three stages: 128-bit steps (ALPHA_QUAD), at most one 64-bit
+// step, then a scalar tail.
+// Every intrinsic name is built from sgn/bits, so the macro does not depend on being
+// instantiated for u8 only.
+#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &_size,                                          \
+                                          const sgn##bits * srcBase, ptrdiff_t srcStride,               \
+                                          sgn##bits * dst3Base, ptrdiff_t dst3Stride,                   \
+                                          sgn##bits * dst1Base, ptrdiff_t dst1Stride)                   \
+{                                                                                                       \
+    internal::assertSupportedConfiguration();                                                           \
+    Size2D size(_size); /* local copy: the dimensions may be rewritten below */                         \
+    if (srcStride == dst3Stride &&                                                                      \
+        srcStride == dst1Stride &&                                                                      \
+        srcStride == (ptrdiff_t)(size.width))                                                           \
+    {   /* all strides equal the width: process the image as one long row */                            \
+        size.width *= size.height;                                                                      \
+        size.height = 1;                                                                                \
+    }                                                                                                   \
+    typedef internal::VecTraits<sgn##bits, 4>::vec128 vec128_4;                                         \
+    typedef internal::VecTraits<sgn##bits, 3>::vec128 vec128_3;                                         \
+    size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
+    typedef internal::VecTraits<sgn##bits, 4>::vec64 vec64_4;                                           \
+    typedef internal::VecTraits<sgn##bits, 3>::vec64 vec64_3;                                           \
+    size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0;    \
+                                                                                                        \
+    for (size_t i = 0u; i < size.height; ++i)                                                           \
+    {                                                                                                   \
+        const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i);                             \
+        sgn##bits * dst3 = internal::getRowPtr(dst3Base, dst3Stride, i);                                \
+        sgn##bits * dst1 = internal::getRowPtr(dst1Base, dst1Stride, i);                                \
+        size_t sj = 0u, d3j = 0u, d1j = 0u; /* indices into src, dst3 and dst1 */                       \
+                                                                                                        \
+        for (; d1j < roiw16; sj += MUL4(16)/sizeof(sgn##bits), d3j += MUL3(16)/sizeof(sgn##bits),       \
+                                                               d1j += 16/sizeof(sgn##bits))             \
+            ALPHA_QUAD(sgn, bits)                                                                       \
+                                                                                                        \
+        if (d1j < roiw8)                                                                                \
+        {                                                                                               \
+            /* the union views the first three loaded vectors as a 3-vector value for vst3 */           \
+            union { vec64_4 v4; vec64_3 v3; } vals;                                                     \
+            vals.v4 = vld4_##sgn##bits(src + sj);                                                       \
+            vst3_##sgn##bits(dst3 + d3j, vals.v3);                                                      \
+            vst1_##sgn##bits(dst1 + d1j, vals.v4.val[3]);                                               \
+            sj += MUL4(8)/sizeof(sgn##bits);                                                            \
+            d3j += MUL3(8)/sizeof(sgn##bits);                                                           \
+            d1j += 8/sizeof(sgn##bits);                                                                 \
+        }                                                                                               \
+                                                                                                        \
+        for (; d1j < size.width; sj += 4, d3j += 3, ++d1j)                                              \
+        {                                                                                               \
+            dst3[d3j+0] = src[sj + 0];                                                                  \
+            dst3[d3j+1] = src[sj + 1];                                                                  \
+            dst3[d3j+2] = src[sj + 2];                                                                  \
+            dst1[d1j]   = src[sj + 3];                                                                  \
+        }                                                                                               \
+    }                                                                                                   \
+}
+
+#else
+
+// Builds without CAROTENE_NEON: the splitN overloads keep their signatures but only run
+// the configuration assertion. VOID_LINE marks one destination (base, stride) pair as used.
+#define VOID_LINE(type, n) (void)dst##n##Stride; (void)dst##n##Base;
+
+#define SPLIT(sgn,bits,n)                                                   \
+void split##n(const Size2D &size,                                           \
+              const sgn##bits * srcBase, ptrdiff_t srcStride                \
+              FILL_LINES##n(FARG, sgn##bits) )                              \
+{                                                                           \
+    internal::assertSupportedConfiguration();                               \
+    FILL_LINES##n(VOID, sgn##bits)                                          \
+    (void)srcStride;                                                        \
+    (void)srcBase;                                                          \
+    (void)size;                                                             \
+}
+
+// The 64-bit stub needs no special handling, so it reuses SPLIT with bits = 64.
+#define SPLIT64(sgn,n) SPLIT(sgn,64,n)
+
+// Builds without CAROTENE_NEON: split4 (3-channel + 1-channel outputs) keeps its signature
+// but only runs the configuration assertion and ignores every argument.
+#define SPLIT4ALPHA(sgn,bits)                                               \
+void split4(const Size2D &size,                                             \
+            const sgn##bits * srcBase, ptrdiff_t srcStride,                 \
+            sgn##bits * dst3Base, ptrdiff_t dst3Stride,                     \
+            sgn##bits * dst1Base, ptrdiff_t dst1Stride)                     \
+{                                                                           \
+    internal::assertSupportedConfiguration();                               \
+    (void)size;                                                             \
+    (void)srcBase;  (void)srcStride;                                        \
+    (void)dst3Base; (void)dst3Stride;                                       \
+    (void)dst1Base; (void)dst1Stride;                                       \
+}
+
+#endif //CAROTENE_NEON
+
+SPLIT(u, 8,2) // split2 for u8: one source, two destination planes
+SPLIT(u, 8,3) // split3 for u8
+SPLIT(u, 8,4) // split4 for u8, four separate destination planes
+SPLIT(u,16,2) // split2 for u16
+SPLIT(u,16,3) // split3 for u16
+SPLIT(u,16,4) // split4 for u16
+SPLIT(s,32,2) // split2 for s32
+SPLIT(s,32,3) // split3 for s32
+SPLIT(s,32,4) // split4 for s32
+
+SPLIT64(s, 2) // split2 for s64
+SPLIT64(s, 3) // split3 for s64
+SPLIT64(s, 4) // split4 for s64
+
+SPLIT4ALPHA(u,8) // split4 for u8 with a 3-channel interleaved plane plus a 1-channel plane
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/channels_combine.cpp
+++ b/3rdparty/carotene/src/channels_combine.cpp
@ -0,0 +1,389 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+// FILL_LINESn(macro, type) expands macro##_LINE(type, k) once for every plane index
+// k = 0 .. n-1. Each level reuses the previous one and appends one more line, so the
+// same per-plane snippet can be stamped out for 2, 3 or 4 planes.
+#define FILL_LINES2(macro,type) \
+            macro##_LINE(type,0) \
+            macro##_LINE(type,1)
+#define FILL_LINES3(macro,type) \
+            FILL_LINES2(macro,type) \
+            macro##_LINE(type,2)
+#define FILL_LINES4(macro,type) \
+            FILL_LINES3(macro,type) \
+            macro##_LINE(type,3)
+
+// Function-argument line: declares the base pointer and the stride of source plane n.
+// It starts with a comma because it is appended after the leading Size2D parameter.
+#define  FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride
+
+#ifdef CAROTENE_NEON
+
+// Per-plane snippets used through FILL_LINESn. They rely on names that exist at the
+// expansion site: row index i, element offsets sj / dj, the destination row dst and the
+// multi-vector accumulator v_dst.
+// VROW: declare the row pointer src<n> for row i of source plane n.
+#define  VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i);
+// PREF: prefetch source plane n at the current offset.
+#define  PREF_LINE(type, n) internal::prefetch(src##n + sj);
+// VLD1Q: load one q-register vector of plane n into v_dst.val[n].
+#define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj);
+// PRLD: PREF followed by VLD1Q for the same plane.
+#define  PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj);
+// VLD1: load one d-register vector of plane n into v_dst.val[n].
+#define  VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj);
+// SLD: scalar copy of one element of plane n into its interleaved slot.
+#define   SLD_LINE(type, n) dst[dj + n] = src##n[sj];
+
+// MULn(val): multiply val by the channel count n (2, 3 or 4) with shifts and adds.
+// The parameter is parenthesised so that a compound argument such as "a + b" is
+// multiplied as a whole instead of being torn apart by operator precedence.
+#define MUL2(val) ((val) << 1)
+#define MUL3(val) (MUL2(val) + (val))
+#define MUL4(val) ((val) << 2)
+
+// CONTSRCn: leading part of an if-condition (note the trailing &&) that is true when
+// each of the n source strides equals dstStride. The user appends the final term.
+// NOTE(review): strides look like byte pitches (cmp.cpp compares them with
+// width * sizeof(type)), and an interleaved destination row is n times longer than a
+// source row, so equal source/destination strides are unexpected for densely packed
+// images - confirm that this is the intended contiguity test.
+#define CONTSRC2 dstStride == src0Stride && \
+                 dstStride == src1Stride &&
+#define CONTSRC3 dstStride == src0Stride && \
+                 dstStride == src1Stride && \
+                 dstStride == src2Stride &&
+#define CONTSRC4 dstStride == src0Stride && \
+                 dstStride == src1Stride && \
+                 dstStride == src2Stride && \
+                 dstStride == src3Stride &&
+
+#if __GNUC__ == 4 && __GNUC_MINOR__ < 7
+
+// MERGE_ASMn(sgn, bits): hand-written NEON sequence used for GCC 4.x older than 4.7.
+// It loads one 16-byte vector from each of the n source rows (src<k> + sj) and writes
+// them interleaved with two vst<n> stores: the low halves (even d registers) go to
+// dst + dj and the high halves (odd d registers) to the second output pointer, which is
+// n * 8 bytes further on.
+// The statement reads and writes memory only through pointer input operands, so
+// "memory" is listed as a clobber; without it the compiler is free to assume that the
+// destination buffer is untouched and to cache or reorder accesses around the asm.
+#define MERGE_ASM2(sgn, bits) __asm__ ( \
+                                          "vld1." #bits " {d0-d1}, [%[in0]]             \n\t" \
+                                          "vld1." #bits " {d2-d3}, [%[in1]]             \n\t" \
+                                          "vst2." #bits " {d0, d2}, [%[out0]]           \n\t" \
+                                          "vst2." #bits " {d1, d3}, [%[out1]]           \n\t" \
+                                          : \
+                                          : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \
+                                            [out0]  "r" (dst + dj), [out1]  "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \
+                                          : "d0","d1","d2","d3","memory" \
+                                      );
+#define MERGE_ASM3(sgn, bits) __asm__ ( \
+                                          "vld1." #bits " {d0-d1}, [%[in0]]             \n\t" \
+                                          "vld1." #bits " {d2-d3}, [%[in1]]             \n\t" \
+                                          "vld1." #bits " {d4-d5}, [%[in2]]             \n\t" \
+                                          "vst3." #bits " {d0, d2, d4}, [%[out0]]       \n\t" \
+                                          "vst3." #bits " {d1, d3, d5}, [%[out1]]       \n\t" \
+                                          : \
+                                          : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \
+                                            [out0]  "r" (dst + dj), [out1]  "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \
+                                          : "d0","d1","d2","d3","d4","d5","memory" \
+                                      );
+#define MERGE_ASM4(sgn, bits) __asm__ ( \
+                                          "vld1." #bits " {d0-d1}, [%[in0]]             \n\t" \
+                                          "vld1." #bits " {d2-d3}, [%[in1]]             \n\t" \
+                                          "vld1." #bits " {d4-d5}, [%[in2]]             \n\t" \
+                                          "vld1." #bits " {d6-d7}, [%[in3]]             \n\t" \
+                                          "vst4." #bits " {d0, d2, d4, d6}, [%[out0]]   \n\t" \
+                                          "vst4." #bits " {d1, d3, d5, d7}, [%[out1]]   \n\t" \
+                                          : \
+                                          : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \
+                                            [out0]  "r" (dst + dj), [out1]  "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \
+                                          : "d0","d1","d2","d3","d4","d5","d6","d7","memory" \
+                                      );
+
+// MERGE_QUAD, inline-asm variant: prefetches every source plane at offset sj (PREF
+// lines) and then runs MERGE_ASM<n> to interleave one 16-byte vector per plane into dst.
+// Expects src<k>, dst, sj and dj to be in scope at the expansion site.
+#define MERGE_QUAD(sgn, bits, n) { \
+                                     FILL_LINES##n(PREF, sgn##bits) \
+                                     MERGE_ASM##n(sgn, bits) \
+                                 }
+
+#else
+
+// MERGE_QUAD, intrinsic variant: for every source plane, prefetch and load one
+// q-register vector into v_dst.val[k] (PRLD lines), then write all planes interleaved
+// with vst<n>q_<type> at dst + dj. Expects the vec128 typedef, src<k>, dst, sj and dj to
+// be in scope at the expansion site. The commented-out lines are a disabled alternative
+// that issues all prefetches first and all loads afterwards.
+#define MERGE_QUAD(sgn, bits, n) { \
+                                     vec128 v_dst; \
+                                     /*FILL_LINES##n(PREF, sgn##bits) \
+                                     FILL_LINES##n(VLD1Q, sgn##bits)*/ \
+                                     FILL_LINES##n(PRLD, sgn##bits) \
+                                     vst##n##q_##sgn##bits(dst + dj, v_dst); \
+                                 }
+
+#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7
+
+#define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size                                             \
+                                        FILL_LINES##n(FARG, sgn##bits),                                     \
+                                        sgn##bits * dstBase, ptrdiff_t dstStride)                           \
+{                                                                                                           \
+    internal::assertSupportedConfiguration();                                                               \
+    Size2D size(_size);                                                                                     \
+    if (CONTSRC##n                                                                                          \
+        dstStride == (ptrdiff_t)(size.width))                                                               \
+    {                                                                                                       \
+        size.width *= size.height;                                                                          \
+        size.height = 1;                                                                                    \
+    }                                                                                                       \
+    typedef internal::VecTraits<sgn##bits, n>::vec128 vec128;                                               \
+    size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \
+    typedef internal::VecTraits<sgn##bits, n>::vec64 vec64;                                                 \
+    size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? size.width - (8/sizeof(sgn##bits) - 1) : 0;    \
+                                                                                                            \
+    for (size_t i = 0u; i < size.height; ++i)                                                               \
+    {                                                                                                       \
+        FILL_LINES##n(VROW, sgn##bits)                                                                      \
+        sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i);                                       \
+        size_t sj = 0u, dj = 0u;                                                                            \
+                                                                                                            \
+        for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits))                 \
+            MERGE_QUAD(sgn, bits, n)                                                                        \
+                                                                                                            \
+        if ( sj < roiw8 )                                                                                   \
+        {                                                                                                   \
+            vec64 v_dst;                                                                                    \
+            FILL_LINES##n(VLD1, sgn##bits)                                                                  \
+            vst##n##_##sgn##bits(dst + dj, v_dst);                                                          \
+            sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits);                                   \
+        }                                                                                                   \
+                                                                                                            \
+        for (; sj < size.width; ++sj, dj += n)                                                              \
+        {                                                                                                   \
+            FILL_LINES##n(SLD, sgn##bits)                                                                   \
+        }                                                                                                   \
+    }                                                                                                       \
+}
+
+#define COMBINE64(sgn,n) void combine##n(const Size2D &_size                                                \
+                                               FILL_LINES##n(FARG, sgn##64),                                \
+                                               sgn##64 * dstBase, ptrdiff_t dstStride)                      \
+{                                                                                                           \
+    internal::assertSupportedConfiguration();                                                               \
+    Size2D size(_size);                                                                                     \
+    if (CONTSRC##n                                                                                          \
+        dstStride == (ptrdiff_t)(size.width))                                                               \
+    {                                                                                                       \
+        size.width *= size.height;                                                                          \
+        size.height = 1;                                                                                    \
+    }                                                                                                       \
+    typedef internal::VecTraits<sgn##64, n>::vec64 vec64;                                                   \
+                                                                                                            \
+    for (size_t i = 0u; i < size.height; ++i)                                                               \
+    {                                                                                                       \
+        FILL_LINES##n(VROW, sgn##64)                                                                        \
+        sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i);                                         \
+        size_t sj = 0u, dj = 0u;                                                                            \
+                                                                                                            \
+        for (; sj < size.width; ++sj, dj += n)                                                              \
+        {                                                                                                   \
+            vec64 v_dst;                                                                                    \
+            FILL_LINES##n(VLD1, sgn##64)                                                                    \
+            vst##n##_##sgn##64(dst + dj, v_dst);                                                            \
+            /*FILL_LINES##n(SLD, sgn##64)*/                                                                 \
+        }                                                                                                   \
+    }                                                                                                       \
+}
+
+#else
+
+// Per-plane snippet for the stub build: marks the base pointer and stride of source
+// plane n as used so that no unused-parameter warnings are produced.
+#define  VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride;
+
+// Stub used when CAROTENE_NEON is not defined: combine<n>() keeps the same signature
+// but only calls internal::assertSupportedConfiguration() and then casts every
+// parameter to void.
+#define COMBINE(sgn,bits,n) void combine##n(const Size2D &size                                              \
+                                        FILL_LINES##n(FARG, sgn##bits),                                     \
+                                        sgn##bits * dstBase, ptrdiff_t dstStride)                           \
+{                                                                                                           \
+    internal::assertSupportedConfiguration();                                                               \
+    (void)size;                                                                                             \
+    FILL_LINES##n(VOID, sgn##bits)                                                                          \
+    (void)dstBase;                                                                                          \
+    (void)dstStride;                                                                                        \
+}
+// The 64-bit stub is the generic stub with the element width fixed to 64 bits.
+#define COMBINE64(sgn,n) COMBINE(sgn,64,n)
+
+#endif //CAROTENE_NEON
+
+// Instantiate combine2 / combine3 / combine4 for u8, u16 and s32 elements ...
+COMBINE(u, 8,2)
+COMBINE(u, 8,3)
+COMBINE(u, 8,4)
+COMBINE(u,16,2)
+COMBINE(u,16,3)
+COMBINE(u,16,4)
+COMBINE(s,32,2)
+COMBINE(s,32,3)
+COMBINE(s,32,4)
+// ... and for s64 elements.
+COMBINE64(s, 2)
+COMBINE64(s, 3)
+COMBINE64(s, 4)
+
+// Interleave separate Y, U and V planes into a packed row of Y0 U Y1 V groups.
+// size.width counts U/V samples per row: for each of them two consecutive Y samples are
+// read and four destination bytes are written. All strides are passed to
+// internal::getRowPtr together with the row index. Without CAROTENE_NEON the function
+// only checks the configuration and does nothing else.
+void combineYUYV(const Size2D &size,
+                 const u8 * srcyBase, ptrdiff_t srcyStride,
+                 const u8 * srcuBase, ptrdiff_t srcuStride,
+                 const u8 * srcvBase, ptrdiff_t srcvStride,
+                 u8 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Exclusive bounds on the U/V index below which a full 32- / 8-sample step still fits.
+#ifndef ANDROID
+    size_t wideEnd = size.width < 31 ? 0 : size.width - 31;
+#endif
+    size_t narrowEnd = size.width < 7 ? 0 : size.width - 7;
+
+    for (size_t row = 0u; row < size.height; ++row)
+    {
+        const u8 * yRow = internal::getRowPtr(srcyBase, srcyStride, row);
+        const u8 * uRow = internal::getRowPtr(srcuBase, srcuStride, row);
+        const u8 * vRow = internal::getRowPtr(srcvBase, srcvStride, row);
+        u8 * out = internal::getRowPtr(dstBase, dstStride, row);
+
+        size_t yPos = 0u;   // offset into the Y row
+        size_t cPos = 0u;   // offset into the U and V rows
+        size_t outPos = 0u; // offset into the packed output row
+
+#ifndef ANDROID
+        // 32 U/V samples per iteration, handled as two 16-sample halves.
+        for (; cPos < wideEnd; cPos += 32, yPos += 64, outPos += 128)
+        {
+            internal::prefetch(yRow + yPos);
+            internal::prefetch(uRow + cPos);
+            internal::prefetch(vRow + cPos);
+
+            // vld2q splits the Y samples into even-indexed (val[0]) and odd-indexed (val[1]).
+            const uint8x16x2_t yFirst = vld2q_u8(yRow + yPos);
+            uint8x16x4_t packedFirst;
+            packedFirst.val[0] = yFirst.val[0];
+            packedFirst.val[1] = vld1q_u8(uRow + cPos);
+            packedFirst.val[2] = yFirst.val[1];
+            packedFirst.val[3] = vld1q_u8(vRow + cPos);
+            vst4q_u8(out + outPos, packedFirst);
+
+            const uint8x16x2_t ySecond = vld2q_u8(yRow + yPos + 32);
+            uint8x16x4_t packedSecond;
+            packedSecond.val[0] = ySecond.val[0];
+            packedSecond.val[1] = vld1q_u8(uRow + cPos + 16);
+            packedSecond.val[2] = ySecond.val[1];
+            packedSecond.val[3] = vld1q_u8(vRow + cPos + 16);
+            vst4q_u8(out + outPos + 64, packedSecond);
+        }
+#endif
+
+        // 8 U/V samples per iteration.
+        for (; cPos < narrowEnd; cPos += 8, yPos += 16, outPos += 32)
+        {
+            const uint8x8x2_t yPair = vld2_u8(yRow + yPos);
+            uint8x8x4_t packed;
+            packed.val[0] = yPair.val[0];
+            packed.val[1] = vld1_u8(uRow + cPos);
+            packed.val[2] = yPair.val[1];
+            packed.val[3] = vld1_u8(vRow + cPos);
+            vst4_u8(out + outPos, packed);
+        }
+
+        // Remaining samples one at a time.
+        for (; cPos < size.width; ++cPos, yPos += 2, outPos += 4)
+        {
+            out[outPos]     = yRow[yPos];
+            out[outPos + 1] = uRow[cPos];
+            out[outPos + 2] = yRow[yPos + 1];
+            out[outPos + 3] = vRow[cPos];
+        }
+    }
+#else
+    (void)size;
+    (void)srcyBase;
+    (void)srcyStride;
+    (void)srcuBase;
+    (void)srcuStride;
+    (void)srcvBase;
+    (void)srcvStride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+// Interleave separate Y, U and V planes into a packed row of U Y0 V Y1 groups.
+// size.width counts U/V samples per row: for each of them two consecutive Y samples are
+// read and four destination bytes are written. All strides are passed to
+// internal::getRowPtr together with the row index. Without CAROTENE_NEON the function
+// only checks the configuration and does nothing else.
+void combineUYVY(const Size2D &size,
+                 const u8 * srcyBase, ptrdiff_t srcyStride,
+                 const u8 * srcuBase, ptrdiff_t srcuStride,
+                 const u8 * srcvBase, ptrdiff_t srcvStride,
+                 u8 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Exclusive bounds on the U/V index below which a full 32- / 8-sample step still fits.
+#ifndef ANDROID
+    size_t wideEnd = size.width < 31 ? 0 : size.width - 31;
+#endif
+    size_t narrowEnd = size.width < 7 ? 0 : size.width - 7;
+
+    for (size_t row = 0u; row < size.height; ++row)
+    {
+        const u8 * yRow = internal::getRowPtr(srcyBase, srcyStride, row);
+        const u8 * uRow = internal::getRowPtr(srcuBase, srcuStride, row);
+        const u8 * vRow = internal::getRowPtr(srcvBase, srcvStride, row);
+        u8 * out = internal::getRowPtr(dstBase, dstStride, row);
+
+        size_t yPos = 0u;   // offset into the Y row
+        size_t cPos = 0u;   // offset into the U and V rows
+        size_t outPos = 0u; // offset into the packed output row
+
+#ifndef ANDROID
+        // 32 U/V samples per iteration, handled as two 16-sample halves.
+        for (; cPos < wideEnd; cPos += 32, yPos += 64, outPos += 128)
+        {
+            internal::prefetch(yRow + yPos);
+            internal::prefetch(uRow + cPos);
+            internal::prefetch(vRow + cPos);
+
+            // vld2q splits the Y samples into even-indexed (val[0]) and odd-indexed (val[1]).
+            const uint8x16x2_t yFirst = vld2q_u8(yRow + yPos);
+            uint8x16x4_t packedFirst;
+            packedFirst.val[0] = vld1q_u8(uRow + cPos);
+            packedFirst.val[1] = yFirst.val[0];
+            packedFirst.val[2] = vld1q_u8(vRow + cPos);
+            packedFirst.val[3] = yFirst.val[1];
+            vst4q_u8(out + outPos, packedFirst);
+
+            const uint8x16x2_t ySecond = vld2q_u8(yRow + yPos + 32);
+            uint8x16x4_t packedSecond;
+            packedSecond.val[0] = vld1q_u8(uRow + cPos + 16);
+            packedSecond.val[1] = ySecond.val[0];
+            packedSecond.val[2] = vld1q_u8(vRow + cPos + 16);
+            packedSecond.val[3] = ySecond.val[1];
+            vst4q_u8(out + outPos + 64, packedSecond);
+        }
+#endif
+
+        // 8 U/V samples per iteration.
+        for (; cPos < narrowEnd; cPos += 8, yPos += 16, outPos += 32)
+        {
+            const uint8x8x2_t yPair = vld2_u8(yRow + yPos);
+            uint8x8x4_t packed;
+            packed.val[0] = vld1_u8(uRow + cPos);
+            packed.val[1] = yPair.val[0];
+            packed.val[2] = vld1_u8(vRow + cPos);
+            packed.val[3] = yPair.val[1];
+            vst4_u8(out + outPos, packed);
+        }
+
+        // Remaining samples one at a time.
+        for (; cPos < size.width; ++cPos, yPos += 2, outPos += 4)
+        {
+            out[outPos]     = uRow[cPos];
+            out[outPos + 1] = yRow[yPos];
+            out[outPos + 2] = vRow[cPos];
+            out[outPos + 3] = yRow[yPos + 1];
+        }
+    }
+#else
+    (void)size;
+    (void)srcyBase;
+    (void)srcyStride;
+    (void)srcuBase;
+    (void)srcuStride;
+    (void)srcvBase;
+    (void)srcvStride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/cmp.cpp
+++ b/3rdparty/carotene/src/cmp.cpp
@ -0,0 +1,340 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+// vnst: store two comparison-result vectors as one byte per element.
+// 8-bit lanes: the two vectors are written back to back (32 bytes).
+inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2)
+{
+    vst1q_u8(dst, v1);
+    vst1q_u8(dst + 16, v2);
+}
+// 16-bit lanes: each vector is narrowed to 8 bits per lane, 16 bytes are written.
+inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2)
+{
+    const uint8x8_t firstHalf = vmovn_u16(v1);
+    const uint8x8_t secondHalf = vmovn_u16(v2);
+    vst1q_u8(dst, vcombine_u8(firstHalf, secondHalf));
+}
+// 32-bit lanes: narrowed twice (32 -> 16 -> 8 bits), 8 bytes are written.
+inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2)
+{
+    const uint16x8_t halfWidth = vcombine_u16(vmovn_u32(v1), vmovn_u32(v2));
+    vst1_u8(dst, vmovn_u16(halfWidth));
+}
+
+// vtail<Op, elsize>::compare handles the vectorisable part of the row tail that is left
+// after the main loop of vcompare. This primary template is used for every element size
+// without a specialisation below (sizes 1 and 2 are specialised) and does nothing, so
+// the whole tail is left to the scalar loop of the caller.
+template <typename Op, int elsize> struct vtail
+{
+    static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
+                               u8 * dst, const Op & op,
+                               size_t &x, size_t width)
+    {
+        // Intentionally empty: no extra vector step is taken for this element size.
+        // The casts only silence unused-parameter warnings.
+        (void)src0;
+        (void)src1;
+        (void)dst;
+        (void)op;
+        (void)x;
+        (void)width;
+    }
+};
+// Tail handler for 2-byte element types. The main loop of vcompare advances by
+// 32 / sizeof(type) = 16 elements, so at most 15 elements are left when this is called.
+//   src0, src1 - current source rows
+//   dst        - current destination row (one byte per element)
+//   op         - comparison functor
+//   x          - in/out: index of the first unprocessed element, advanced by 8 if a
+//                vector step was taken
+//   width      - number of elements in the row
+template <typename Op> struct vtail<Op, 2>
+{
+    static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
+                               u8 * dst, const Op & op,
+                               size_t &x, size_t width)
+    {
+        typedef typename Op::type type;
+        typedef typename internal::VecTraits<type>::vec128 vec128;
+        typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
+        // There are no more than 15 elements in the tail, so a single 8-element vector
+        // step is enough. "<=" (not "<") lets a tail of exactly 8 elements take the
+        // vector path too: elements x .. x + 7 are all inside the row in that case.
+        if( x + 8 <= width)
+        {
+            vec128  v_src0, v_src1;
+            uvec128 v_dst;
+
+            v_src0 = internal::vld1q(src0 + x);
+            v_src1 = internal::vld1q(src1 + x);
+            op(v_src0, v_src1, v_dst);
+            // Narrow the 16-bit lane masks to one byte per element before storing.
+            internal::vst1(dst + x, internal::vmovn(v_dst));
+            x+=8;
+        }
+    }
+};
+// Tail handler for 1-byte element types. The main loop of vcompare advances by
+// 32 / sizeof(type) = 32 elements, so at most 31 elements are left when this is called.
+//   src0, src1 - current source rows
+//   dst        - current destination row (one byte per element)
+//   op         - comparison functor
+//   x          - in/out: index of the first unprocessed element, advanced by the number
+//                of elements handled here (0, 8, 16 or 24)
+//   width      - number of elements in the row
+template <typename Op> struct vtail<Op, 1>
+{
+    static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
+                               u8 * dst, const Op & op,
+                               size_t &x, size_t width)
+    {
+        typedef typename Op::type type;
+        typedef typename internal::VecTraits<type>::vec128 vec128;
+        typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
+        typedef typename internal::VecTraits<type>::vec64 vec64;
+        typedef typename internal::VecTraits<type>::unsign::vec64 uvec64;
+        // There are no more than 31 elements in the tail, so 16+8, 16 or 8 elements can be
+        // handled with at most one step of each width. Both tests use "<=" so that a
+        // remainder of exactly 16 or exactly 8 elements takes the vector path as well;
+        // every element touched lies inside the row.
+        if( x + 16 <= width)
+        {
+            vec128  v_src0, v_src1;
+            uvec128 v_dst;
+
+            v_src0 = internal::vld1q(src0 + x);
+            v_src1 = internal::vld1q(src1 + x);
+            op(v_src0, v_src1, v_dst);
+            internal::vst1q(dst + x, v_dst);
+            x+=16;
+        }
+        if( x + 8 <= width)
+        {
+            vec64  v_src0, v_src1;
+            uvec64 v_dst;
+
+            v_src0 = internal::vld1(src0 + x);
+            v_src1 = internal::vld1(src1 + x);
+            op(v_src0, v_src1, v_dst);
+            internal::vst1(dst + x, v_dst);
+            x+=8;
+        }
+    }
+};
+
+// Element-wise comparison driver: applies op to every pair of elements of two images
+// and writes one result byte per element.
+//   size               - image size in elements; taken by value because it may be
+//                        rewritten locally to a single long row
+//   src0Base/src1Base  - source images with byte strides src0Stride / src1Stride
+//   dstBase/dstStride  - u8 destination image and its byte stride
+//   op                 - functor providing 128-bit, 64-bit and scalar overloads
+// Every row is processed in blocks of 32 / sizeof(type) elements (two 128-bit vectors),
+// then by vtail for the element sizes that have a specialisation, then element by
+// element.
+template <typename Op>
+void vcompare(Size2D size,
+              const typename Op::type * src0Base, ptrdiff_t src0Stride,
+              const typename Op::type * src1Base, ptrdiff_t src1Stride,
+              u8 * dstBase, ptrdiff_t dstStride, const Op & op)
+{
+    typedef typename Op::type type;
+    typedef typename internal::VecTraits<type>::vec128 vec128;
+    typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
+
+    // Flatten the image into one long row only when all three images are densely packed.
+    // The destination holds one byte per element, so its packed stride is size.width
+    // bytes and not the source stride: comparing dstStride with the source stride would
+    // miss the packed case for 2- and 4-byte types and would wrongly flatten a padded
+    // destination whose pitch happens to equal the source pitch.
+    if (src0Stride == (ptrdiff_t)(size.width * sizeof(type)) &&
+        src1Stride == src0Stride &&
+        dstStride == (ptrdiff_t)(size.width * sizeof(u8)))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+
+    const u32 step_base = 32 / sizeof(type);
+    // Exclusive bound on x below which a full block of step_base elements still fits.
+    size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
+
+    for (size_t y = 0; y < size.height; ++y)
+    {
+        const type * src0 = internal::getRowPtr(src0Base, src0Stride, y);
+        const type * src1 = internal::getRowPtr(src1Base, src1Stride, y);
+        u8 * dst = internal::getRowPtr(dstBase, dstStride, y);
+        size_t x = 0;
+
+        for( ; x < roiw_base; x += step_base )
+        {
+            internal::prefetch(src0 + x);
+            internal::prefetch(src1 + x);
+
+            vec128 v_src00 = internal::vld1q(src0 + x), v_src01 = internal::vld1q(src0 + x + 16 / sizeof(type));
+            vec128 v_src10 = internal::vld1q(src1 + x), v_src11 = internal::vld1q(src1 + x + 16 / sizeof(type));
+            uvec128 v_dst0;
+            uvec128 v_dst1;
+
+            op(v_src00, v_src10, v_dst0);
+            op(v_src01, v_src11, v_dst1);
+
+            // vnst narrows the lane masks to bytes where needed and stores step_base bytes.
+            vnst(dst + x, v_dst0, v_dst1);
+        }
+
+        // Optional shorter vector steps; advances x by the number of elements handled.
+        vtail<Op, sizeof(type)>::compare(src0, src1, dst, op, x, size.width);
+
+        for (; x < size.width; ++x)
+        {
+            op(src0 + x, src1 + x, dst + x);
+        }
+    }
+}
+
+// "Equal" predicate for vcompare. Provides a 128-bit vector form, a 64-bit vector form
+// and a scalar form; the scalar form writes 255 for true and 0 for false.
+template<typename T>
+struct OpCmpEQ
+{
+    typedef T type;
+
+    // 128-bit form: mask receives the result of internal::vceqq(lhs, rhs).
+    void operator() (const typename internal::VecTraits<T>::vec128 & lhs,
+                     const typename internal::VecTraits<T>::vec128 & rhs,
+                     typename internal::VecTraits<T>::unsign::vec128 & mask) const
+    {
+        mask = internal::vceqq(lhs, rhs);
+    }
+
+    // 64-bit form: mask receives the result of internal::vceq(lhs, rhs).
+    void operator() (const typename internal::VecTraits<T>::vec64 & lhs,
+                     const typename internal::VecTraits<T>::vec64 & rhs,
+                     typename internal::VecTraits<T>::unsign::vec64 & mask) const
+    {
+        mask = internal::vceq(lhs, rhs);
+    }
+
+    // Scalar form: compares one element and writes one result byte.
+    void operator() (const T * lhs, const T * rhs, u8 * out) const
+    {
+        if (lhs[0] == rhs[0])
+            out[0] = 255;
+        else
+            out[0] = 0;
+    }
+};
+
+// "Not equal" predicate for vcompare. Provides a 128-bit vector form, a 64-bit vector
+// form and a scalar form; the scalar form writes 255 for true and 0 for false.
+template<typename T>
+struct OpCmpNE
+{
+    typedef T type;
+
+    // 128-bit form: the equality result is inverted with internal::vmvnq.
+    void operator() (const typename internal::VecTraits<T>::vec128 & lhs,
+                     const typename internal::VecTraits<T>::vec128 & rhs,
+                     typename internal::VecTraits<T>::unsign::vec128 & mask) const
+    {
+        mask = internal::vmvnq(internal::vceqq(lhs, rhs));
+    }
+
+    // 64-bit form: the equality result is inverted with internal::vmvn.
+    void operator() (const typename internal::VecTraits<T>::vec64 & lhs,
+                     const typename internal::VecTraits<T>::vec64 & rhs,
+                     typename internal::VecTraits<T>::unsign::vec64 & mask) const
+    {
+        mask = internal::vmvn(internal::vceq(lhs, rhs));
+    }
+
+    // Scalar form: written as "equal ? 0 : 255", mirroring the inverted vector result.
+    void operator() (const T * lhs, const T * rhs, u8 * out) const
+    {
+        if (lhs[0] == rhs[0])
+            out[0] = 0;
+        else
+            out[0] = 255;
+    }
+};
+
+// "Greater than" predicate for vcompare. Provides a 128-bit vector form, a 64-bit vector
+// form and a scalar form; the scalar form writes 255 for true and 0 for false.
+template<typename T>
+struct OpCmpGT
+{
+    typedef T type;
+
+    // 128-bit form: mask receives the result of internal::vcgtq(lhs, rhs).
+    void operator() (const typename internal::VecTraits<T>::vec128 & lhs,
+                     const typename internal::VecTraits<T>::vec128 & rhs,
+                     typename internal::VecTraits<T>::unsign::vec128 & mask) const
+    {
+        mask = internal::vcgtq(lhs, rhs);
+    }
+
+    // 64-bit form: mask receives the result of internal::vcgt(lhs, rhs).
+    void operator() (const typename internal::VecTraits<T>::vec64 & lhs,
+                     const typename internal::VecTraits<T>::vec64 & rhs,
+                     typename internal::VecTraits<T>::unsign::vec64 & mask) const
+    {
+        mask = internal::vcgt(lhs, rhs);
+    }
+
+    // Scalar form: compares one element and writes one result byte.
+    void operator() (const T * lhs, const T * rhs, u8 * out) const
+    {
+        if (lhs[0] > rhs[0])
+            out[0] = 255;
+        else
+            out[0] = 0;
+    }
+};
+
+// "Greater than or equal" predicate for vcompare. Provides a 128-bit vector form, a
+// 64-bit vector form and a scalar form; the scalar form writes 255 for true and 0 for
+// false.
+template<typename T>
+struct OpCmpGE
+{
+    typedef T type;
+
+    // 128-bit form: mask receives the result of internal::vcgeq(lhs, rhs).
+    void operator() (const typename internal::VecTraits<T>::vec128 & lhs,
+                     const typename internal::VecTraits<T>::vec128 & rhs,
+                     typename internal::VecTraits<T>::unsign::vec128 & mask) const
+    {
+        mask = internal::vcgeq(lhs, rhs);
+    }
+
+    // 64-bit form: mask receives the result of internal::vcge(lhs, rhs).
+    void operator() (const typename internal::VecTraits<T>::vec64 & lhs,
+                     const typename internal::VecTraits<T>::vec64 & rhs,
+                     typename internal::VecTraits<T>::unsign::vec64 & mask) const
+    {
+        mask = internal::vcge(lhs, rhs);
+    }
+
+    // Scalar form: compares one element and writes one result byte.
+    void operator() (const T * lhs, const T * rhs, u8 * out) const
+    {
+        if (lhs[0] >= rhs[0])
+            out[0] = 255;
+        else
+            out[0] = 0;
+    }
+};
+
+}
+
+// IMPL_CMPOP(op, type) defines the public function cmp<op>() for element type `type`.
+// It checks the configuration and forwards all arguments to vcompare with a
+// default-constructed OpCmp<op><type> functor. The destination is always u8.
+#define IMPL_CMPOP(op, type)                              \
+void cmp##op(const Size2D &size,                          \
+             const type * src0Base, ptrdiff_t src0Stride, \
+             const type * src1Base, ptrdiff_t src1Stride, \
+                       u8 *dstBase, ptrdiff_t dstStride)  \
+{                                                         \
+    internal::assertSupportedConfiguration();             \
+    vcompare(size,                                        \
+             src0Base, src0Stride,                        \
+             src1Base, src1Stride,                        \
+             dstBase, dstStride,                          \
+             OpCmp##op<type>());                          \
+}
+
+#else
+
+// Stub used when CAROTENE_NEON is not defined: cmp<op>() keeps the same signature but
+// only calls internal::assertSupportedConfiguration() and then casts every parameter to
+// void so that no unused-parameter warnings are produced.
+#define IMPL_CMPOP(op, type)                              \
+void cmp##op(const Size2D &size,                          \
+             const type * src0Base, ptrdiff_t src0Stride, \
+             const type * src1Base, ptrdiff_t src1Stride, \
+             u8 *dstBase, ptrdiff_t dstStride)            \
+{                                                         \
+    internal::assertSupportedConfiguration();             \
+    (void)size;                                           \
+    (void)src0Base;                                       \
+    (void)src0Stride;                                     \
+    (void)src1Base;                                       \
+    (void)src1Stride;                                     \
+    (void)dstBase;                                        \
+    (void)dstStride;                                      \
+}
+
+#endif
+
+// Instantiate cmpEQ, cmpNE, cmpGT and cmpGE for every supported element type
+// (u8, s8, u16, s16, u32, s32, f32).
+// cmpEQ: element-wise "equal".
+IMPL_CMPOP(EQ, u8)
+IMPL_CMPOP(EQ, s8)
+IMPL_CMPOP(EQ, u16)
+IMPL_CMPOP(EQ, s16)
+IMPL_CMPOP(EQ, u32)
+IMPL_CMPOP(EQ, s32)
+IMPL_CMPOP(EQ, f32)
+
+// cmpNE: element-wise "not equal".
+IMPL_CMPOP(NE, u8)
+IMPL_CMPOP(NE, s8)
+IMPL_CMPOP(NE, u16)
+IMPL_CMPOP(NE, s16)
+IMPL_CMPOP(NE, u32)
+IMPL_CMPOP(NE, s32)
+IMPL_CMPOP(NE, f32)
+
+// cmpGT: element-wise "greater than".
+IMPL_CMPOP(GT, u8)
+IMPL_CMPOP(GT, s8)
+IMPL_CMPOP(GT, u16)
+IMPL_CMPOP(GT, s16)
+IMPL_CMPOP(GT, u32)
+IMPL_CMPOP(GT, s32)
+IMPL_CMPOP(GT, f32)
+
+// cmpGE: element-wise "greater than or equal".
+IMPL_CMPOP(GE, u8)
+IMPL_CMPOP(GE, s8)
+IMPL_CMPOP(GE, u16)
+IMPL_CMPOP(GE, s16)
+IMPL_CMPOP(GE, u32)
+IMPL_CMPOP(GE, s32)
+IMPL_CMPOP(GE, f32)
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/colorconvert.cpp
+++ b/3rdparty/carotene/src/colorconvert.cpp
--- a/3rdparty/carotene/src/common.cpp
+++ b/3rdparty/carotene/src/common.cpp
@ -0,0 +1,108 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include <cstdlib>
+#include <iostream>
+
+#include "common.hpp"
+
+namespace CAROTENE_NS {
+
+/*
+ * Reports whether the NEON code paths were compiled into this build.
+ *
+ * Returns true when CAROTENE_NEON is defined at compile time and false
+ * otherwise; the answer is fixed for a given binary.
+ */
+bool isSupportedConfiguration()
+{
+#ifdef CAROTENE_NEON
+    const bool neonCompiledIn = true;
+#else
+    const bool neonCompiledIn = false;
+#endif
+    return neonCompiledIn;
+}
+
+namespace internal {
+
+/*
+ * Terminates the process when a library function cannot be executed.
+ *
+ * Two conditions are checked, in this order:
+ *   1. isSupportedConfiguration() is false;
+ *   2. the caller reports parametersSupported == false.
+ * The first failing condition has its message written to std::cerr, after
+ * which std::abort() is called. The function returns normally only when both
+ * checks pass.
+ */
+void assertSupportedConfiguration(bool parametersSupported)
+{
+    const char * failure = NULL;
+
+    if (!isSupportedConfiguration())
+        failure = "internal error: attempted to use an unavailable function";
+    else if (!parametersSupported)
+        failure = "internal error: attempted to use a function with unsupported parameters";
+
+    if (failure != NULL)
+    {
+        std::cerr << failure << std::endl;
+        std::abort();
+    }
+}
+
+/*
+ * Maps the coordinate _p onto an index usable for border extrapolation.
+ *
+ * The valid range is [0, _len) widened by startMargin elements in front and
+ * endMargin elements behind. A coordinate inside that widened range is
+ * returned unchanged. Otherwise the result depends on borderType:
+ *   - BORDER_MODE_REPLICATE: clamped to the first/last element of the range;
+ *   - BORDER_MODE_REFLECT / BORDER_MODE_REFLECT101: mirrored back into the
+ *     range (REFLECT101 does not repeat the edge element); a widened range of
+ *     length 1 yields 0;
+ *   - BORDER_MODE_WRAP: wrapped around modulo the widened length;
+ *   - BORDER_MODE_CONSTANT: -1 - startMargin, i.e. an index below the range;
+ *   - anything else: internal::assertSupportedConfiguration(false) is called.
+ */
+ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin, size_t endMargin)
+{
+    // work in a shifted coordinate system in which the widened range starts at 0
+    const ptrdiff_t shift = (ptrdiff_t)startMargin;
+    const size_t extLen = _len + startMargin + endMargin;
+    const ptrdiff_t extLenS = (ptrdiff_t)extLen;
+    ptrdiff_t pos = _p + shift;
+
+    // the unsigned comparison rejects negative positions as well
+    if( (size_t)pos < extLen )
+        return _p;
+
+    if( borderType == BORDER_MODE_REPLICATE )
+    {
+        pos = pos < 0 ? 0 : extLenS - 1;
+    }
+    else if( borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REFLECT101 )
+    {
+        if( extLen == 1 )
+            return 0;
+
+        const s32 delta = borderType == BORDER_MODE_REFLECT101 ? 1 : 0;
+        do
+        {
+            // fold the position back over whichever edge it crossed
+            pos = pos < 0 ? -pos - 1 + delta
+                          : extLenS - 1 - (pos - extLenS) - delta;
+        }
+        while( (size_t)pos >= extLen );
+    }
+    else if( borderType == BORDER_MODE_WRAP )
+    {
+        if( pos < 0 )
+            pos -= ((pos - extLenS + 1) / extLenS) * extLenS;
+        if( pos >= extLenS )
+            pos %= extLenS;
+    }
+    else if( borderType == BORDER_MODE_CONSTANT )
+    {
+        pos = -1;
+    }
+    else
+    {
+        internal::assertSupportedConfiguration(false);
+    }
+
+    // translate back to the caller's coordinate system
+    return pos - shift;
+}
+
+} // namespace internal
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/common.hpp
+++ b/3rdparty/carotene/src/common.hpp
@ -0,0 +1,96 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#ifndef CAROTENE_SRC_COMMON_HPP
+#define CAROTENE_SRC_COMMON_HPP
+
+#include <cstddef>
+#include <algorithm>
+
+#if defined WITH_NEON && (defined __ARM_NEON__ || defined __ARM_NEON)
+#define CAROTENE_NEON
+#endif
+
+#ifdef CAROTENE_NEON
+#include <arm_neon.h>
+#include "intrinsics.hpp"
+#endif
+
+#include <carotene/functions.hpp>
+#include "saturate_cast.hpp"
+
+namespace CAROTENE_NS { namespace internal {
+
+/*
+ * Issues a prefetch hint for the address `offset` bytes past `ptr`
+ * (320 bytes by default).
+ *
+ * Compilers defining __GNUC__ use __builtin_prefetch; MSVC uses __prefetch
+ * when CAROTENE_NEON is defined; with any other toolchain the call does
+ * nothing.
+ */
+inline void prefetch(const void *ptr, size_t offset = 32*10)
+{
+#if defined __GNUC__
+    const char * const hintAddress = static_cast<const char *>(ptr) + offset;
+    __builtin_prefetch(hintAddress);
+#elif defined _MSC_VER && defined CAROTENE_NEON
+    const char * const hintAddress = static_cast<const char *>(ptr) + offset;
+    __prefetch(hintAddress);
+#else
+    (void)offset;
+    (void)ptr;
+#endif
+}
+
+/*
+ * Returns a pointer to the first element of row `row` of an image whose rows
+ * are `stride` bytes apart, starting at `base`.
+ *
+ * The offset is computed in bytes, so `stride` may differ from the row width
+ * in elements (and may be negative). Constness of T is preserved.
+ */
+template <typename T>
+inline T *getRowPtr(T *base, ptrdiff_t stride, size_t row)
+{
+    const ptrdiff_t byteOffset = ptrdiff_t(row) * stride;
+    const char *rowStart = reinterpret_cast<const char *>(base) + byteOffset;
+    return reinterpret_cast<T *>(const_cast<char *>(rowStart));
+}
+
+// Writes a diagnostic to std::cerr and aborts the process when
+// isSupportedConfiguration() is false or when parametersSupported is false;
+// returns normally otherwise.
+void assertSupportedConfiguration(bool parametersSupported = true);
+
+// Maps the coordinate _p onto the range [0, _len) widened by startMargin in
+// front and endMargin behind, following the rule selected by borderType.
+// A coordinate already inside the widened range is returned unchanged.
+ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin = 0, size_t endMargin = 0);
+
+/*!
+ *  Rounds a pointer up to a multiple of n bytes
+ *
+ *  The pointer is moved forward by the smallest non-negative offset that makes
+ *  its address a multiple of n; an already aligned pointer is returned as is.
+ *  The rounding is done with a bit mask, so the result is only meaningful when
+ *  n is a power of two (this also applies to the default, sizeof(T)).
+ */
+template<typename T> inline T* alignPtr(T* ptr, size_t n=sizeof(T))
+{
+    const size_t address = (size_t)ptr;
+    const size_t mask = ~(n - 1);   // same bit pattern as the unsigned value -n
+    return (T*)((address + (n - 1)) & mask);
+}
+
+}}
+
+#endif
--- a/3rdparty/carotene/src/convert.cpp
+++ b/3rdparty/carotene/src/convert.cpp
--- a/3rdparty/carotene/src/convert_depth.cpp
+++ b/3rdparty/carotene/src/convert_depth.cpp
@ -0,0 +1,399 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include <cstring>
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+/*
+ * Widens every u8 element of the source image to s16 and shifts it left by
+ * the compile-time constant `shift`, writing the result to the destination.
+ *
+ * Rows are processed 16 elements at a time, then 8 at a time, and the
+ * remaining elements one by one.
+ */
+template <int shift>
+void lshiftConst(const Size2D &size,
+                 const u8 * srcBase, ptrdiff_t srcStride,
+                 s16 * dstBase, ptrdiff_t dstStride)
+{
+    // exclusive upper bounds for the start index of a 16- / 8-element step
+    const size_t vecEnd16 = size.width >= 15 ? size.width - 15 : 0;
+    const size_t vecEnd8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const u8 * srcRow = internal::getRowPtr(srcBase, srcStride, row);
+        s16 * dstRow = internal::getRowPtr(dstBase, dstStride, row);
+        size_t col = 0;
+
+        // 16 elements per step
+        while (col < vecEnd16)
+        {
+            internal::prefetch(srcRow + col);
+            const uint8x16_t pixels = vld1q_u8(srcRow + col);
+            const int16x8_t lowHalf = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
+            const int16x8_t highHalf = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
+
+            vst1q_s16(dstRow + col, vshlq_n_s16(lowHalf, shift));
+            vst1q_s16(dstRow + col + 8, vshlq_n_s16(highHalf, shift));
+            col += 16;
+        }
+
+        // 8 elements per step
+        while (col < vecEnd8)
+        {
+            const int16x8_t widened = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(srcRow + col)));
+            vst1q_s16(dstRow + col, vshlq_n_s16(widened, shift));
+            col += 8;
+        }
+
+        // scalar tail
+        while (col < size.width)
+        {
+            dstRow[col] = ((s16)srcRow[col] << shift);
+            ++col;
+        }
+    }
+}
+
+/*
+ * Specialization for a zero shift: every u8 element is only widened to s16,
+ * no shift instruction is issued.
+ *
+ * Rows are processed 16 elements at a time, then 8 at a time, and the
+ * remaining elements one by one.
+ */
+template <>
+void lshiftConst<0>(const Size2D &size,
+                    const u8 * srcBase, ptrdiff_t srcStride,
+                    s16 * dstBase, ptrdiff_t dstStride)
+{
+    // exclusive upper bounds for the start index of a 16- / 8-element step
+    const size_t vecEnd16 = size.width >= 15 ? size.width - 15 : 0;
+    const size_t vecEnd8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const u8 * srcRow = internal::getRowPtr(srcBase, srcStride, row);
+        s16 * dstRow = internal::getRowPtr(dstBase, dstStride, row);
+        size_t col = 0;
+
+        // 16 elements per step
+        while (col < vecEnd16)
+        {
+            internal::prefetch(srcRow + col);
+            const uint8x16_t pixels = vld1q_u8(srcRow + col);
+            const int16x8_t lowHalf = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
+            const int16x8_t highHalf = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
+
+            vst1q_s16(dstRow + col, lowHalf);
+            vst1q_s16(dstRow + col + 8, highHalf);
+            col += 16;
+        }
+
+        // 8 elements per step
+        while (col < vecEnd8)
+        {
+            const int16x8_t widened = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(srcRow + col)));
+            vst1q_s16(dstRow + col, widened);
+            col += 8;
+        }
+
+        // scalar tail
+        while (col < size.width)
+        {
+            dstRow[col] = (s16)srcRow[col];
+            ++col;
+        }
+    }
+}
+
+/*
+ * Shifts every s16 element of the source image right by the compile-time
+ * constant `shift` and narrows the result to u8.
+ *
+ * With CONVERT_POLICY_SATURATE the shifted value is saturated to the u8
+ * range; with any other policy (CONVERT_POLICY_WRAP) only the low 8 bits are
+ * kept. Rows are processed 16 elements at a time, then 8 at a time, and the
+ * remaining elements one by one.
+ */
+template <int shift>
+void rshiftConst(const Size2D &size,
+                 const s16 * srcBase, ptrdiff_t srcStride,
+                 u8 * dstBase, ptrdiff_t dstStride,
+                 CONVERT_POLICY cpolicy)
+{
+    // exclusive upper bounds for the start index of a 16- / 8-element step
+    const size_t vecEnd16 = size.width >= 15 ? size.width - 15 : 0;
+    const size_t vecEnd8 = size.width >= 7 ? size.width - 7 : 0;
+    const bool saturate = cpolicy == CONVERT_POLICY_SATURATE;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const s16 * srcRow = internal::getRowPtr(srcBase, srcStride, row);
+        u8 * dstRow = internal::getRowPtr(dstBase, dstStride, row);
+        size_t col = 0;
+
+        if (saturate)
+        {
+            while (col < vecEnd16)
+            {
+                internal::prefetch(srcRow + col);
+                const int16x8_t shifted0 = vshrq_n_s16(vld1q_s16(srcRow + col), shift);
+                const int16x8_t shifted1 = vshrq_n_s16(vld1q_s16(srcRow + col + 8), shift);
+                // signed -> unsigned saturating narrow
+                const uint8x16_t packed = vcombine_u8(vqmovun_s16(shifted0),
+                                                      vqmovun_s16(shifted1));
+                vst1q_u8(dstRow + col, packed);
+                col += 16;
+            }
+            while (col < vecEnd8)
+            {
+                const int16x8_t shifted = vshrq_n_s16(vld1q_s16(srcRow + col), shift);
+                vst1_u8(dstRow + col, vqmovun_s16(shifted));
+                col += 8;
+            }
+
+            while (col < size.width)
+            {
+                dstRow[col] = internal::saturate_cast<u8>((srcRow[col] >> shift));
+                ++col;
+            }
+        }
+        else // CONVERT_POLICY_WRAP
+        {
+            while (col < vecEnd16)
+            {
+                internal::prefetch(srcRow + col);
+                const int16x8_t shifted0 = vshrq_n_s16(vld1q_s16(srcRow + col), shift);
+                const int16x8_t shifted1 = vshrq_n_s16(vld1q_s16(srcRow + col + 8), shift);
+                // plain narrow: keeps the low byte of each lane
+                const int8x16_t packed = vcombine_s8(vmovn_s16(shifted0),
+                                                     vmovn_s16(shifted1));
+                vst1q_u8(dstRow + col, vreinterpretq_u8_s8(packed));
+                col += 16;
+            }
+            while (col < vecEnd8)
+            {
+                const int16x8_t shifted = vshrq_n_s16(vld1q_s16(srcRow + col), shift);
+                vst1_u8(dstRow + col, vreinterpret_u8_s8(vmovn_s16(shifted)));
+                col += 8;
+            }
+
+            while (col < size.width)
+            {
+                dstRow[col] = (u8)((srcRow[col] >> shift));
+                ++col;
+            }
+        }
+    }
+}
+
+/*
+ * Specialization for a zero shift: every s16 element is narrowed to u8
+ * without issuing a shift instruction.
+ *
+ * With CONVERT_POLICY_SATURATE the value is saturated to the u8 range; with
+ * any other policy (CONVERT_POLICY_WRAP) only the low 8 bits are kept. Rows
+ * are processed 16 elements at a time, then 8 at a time, and the remaining
+ * elements one by one.
+ */
+template <>
+void rshiftConst<0>(const Size2D &size,
+                    const s16 * srcBase, ptrdiff_t srcStride,
+                    u8 * dstBase, ptrdiff_t dstStride,
+                    CONVERT_POLICY cpolicy)
+{
+    // exclusive upper bounds for the start index of a 16- / 8-element step
+    const size_t vecEnd16 = size.width >= 15 ? size.width - 15 : 0;
+    const size_t vecEnd8 = size.width >= 7 ? size.width - 7 : 0;
+    const bool saturate = cpolicy == CONVERT_POLICY_SATURATE;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const s16 * srcRow = internal::getRowPtr(srcBase, srcStride, row);
+        u8 * dstRow = internal::getRowPtr(dstBase, dstStride, row);
+        size_t col = 0;
+
+        if (saturate)
+        {
+            while (col < vecEnd16)
+            {
+                internal::prefetch(srcRow + col);
+                const int16x8_t first = vld1q_s16(srcRow + col);
+                const int16x8_t second = vld1q_s16(srcRow + col + 8);
+                // signed -> unsigned saturating narrow
+                const uint8x16_t packed = vcombine_u8(vqmovun_s16(first), vqmovun_s16(second));
+                vst1q_u8(dstRow + col, packed);
+                col += 16;
+            }
+            while (col < vecEnd8)
+            {
+                const int16x8_t values = vld1q_s16(srcRow + col);
+                vst1_u8(dstRow + col, vqmovun_s16(values));
+                col += 8;
+            }
+
+            while (col < size.width)
+            {
+                dstRow[col] = internal::saturate_cast<u8>(srcRow[col]);
+                ++col;
+            }
+        }
+        else // CONVERT_POLICY_WRAP
+        {
+            while (col < vecEnd16)
+            {
+                internal::prefetch(srcRow + col);
+                const int16x8_t first = vld1q_s16(srcRow + col);
+                const int16x8_t second = vld1q_s16(srcRow + col + 8);
+                // plain narrow: keeps the low byte of each lane
+                const int8x16_t packed = vcombine_s8(vmovn_s16(first), vmovn_s16(second));
+                vst1q_u8(dstRow + col, vreinterpretq_u8_s8(packed));
+                col += 16;
+            }
+            while (col < vecEnd8)
+            {
+                const int16x8_t values = vld1q_s16(srcRow + col);
+                vst1_u8(dstRow + col, vreinterpret_u8_s8(vmovn_s16(values)));
+                col += 8;
+            }
+
+            while (col < size.width)
+            {
+                dstRow[col] = (u8)srcRow[col];
+                ++col;
+            }
+        }
+    }
+}
+
+// Pointer type matching the signature of every lshiftConst<N> instantiation;
+// lets the shift amount be selected at run time through a table.
+typedef void (* lshiftConstFunc)(const Size2D &size,
+                                const u8 * srcBase, ptrdiff_t srcStride,
+                                s16 * dstBase, ptrdiff_t dstStride);
+
+// Pointer type matching the signature of every rshiftConst<N> instantiation;
+// lets the shift amount be selected at run time through a table.
+typedef void (* rshiftConstFunc)(const Size2D &size,
+                                const s16 * srcBase, ptrdiff_t srcStride,
+                                u8 * dstBase, ptrdiff_t dstStride,
+                                CONVERT_POLICY cpolicy);
+
+} // namespace
+
+#endif
+
+/*
+ * Converts a u8 image to s16 and shifts every element left by `shift` bits.
+ *
+ * size               - image dimensions in elements
+ * srcBase/srcStride  - source image and its row pitch in bytes
+ * dstBase/dstStride  - destination image and its row pitch in bytes
+ * shift              - shift amount; 16 or more fills the destination with zeros
+ *
+ * Aborts through internal::assertSupportedConfiguration() when the library
+ * was built without NEON support.
+ */
+void lshift(const Size2D &size,
+            const u8 * srcBase, ptrdiff_t srcStride,
+            s16 * dstBase, ptrdiff_t dstStride,
+            u32 shift)
+{
+    internal::assertSupportedConfiguration();
+
+#ifdef CAROTENE_NEON
+    if (shift >= 16u)
+    {
+        // every bit is shifted out of the 16-bit result: write zeros row by row
+        for (size_t i = 0; i < size.height; ++i)
+        {
+            s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
+            std::memset(dst, 0, sizeof(s16) * size.width);
+        }
+        return;
+    }
+
+    // this ugly construction is needed to avoid:
+    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
+    // return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
+
+    // The table never changes, so it is built once instead of on every call;
+    // shift < 16 is guaranteed by the check above.
+    static const lshiftConstFunc funcs[16] =
+    {
+        lshiftConst<0>,
+        lshiftConst<1>,
+        lshiftConst<2>,
+        lshiftConst<3>,
+        lshiftConst<4>,
+        lshiftConst<5>,
+        lshiftConst<6>,
+        lshiftConst<7>,
+        lshiftConst<8>,
+        lshiftConst<9>,
+        lshiftConst<10>,
+        lshiftConst<11>,
+        lshiftConst<12>,
+        lshiftConst<13>,
+        lshiftConst<14>,
+        lshiftConst<15>
+    };
+    const lshiftConstFunc func = funcs[shift];
+
+    func(size, srcBase, srcStride, dstBase, dstStride);
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)shift;
+#endif
+}
+
+/*
+ * Shifts every s16 element of an image right by `shift` bits and narrows the
+ * result to u8.
+ *
+ * size               - image dimensions in elements
+ * srcBase/srcStride  - source image and its row pitch in bytes
+ * dstBase/dstStride  - destination image and its row pitch in bytes
+ * shift              - shift amount; values of 16 or more are handled separately
+ * cpolicy            - CONVERT_POLICY_SATURATE saturates to the u8 range,
+ *                      CONVERT_POLICY_WRAP keeps the low 8 bits
+ *
+ * Aborts through internal::assertSupportedConfiguration() when the library
+ * was built without NEON support.
+ */
+void rshift(const Size2D &size,
+            const s16 * srcBase, ptrdiff_t srcStride,
+            u8 * dstBase, ptrdiff_t dstStride,
+            u32 shift, CONVERT_POLICY cpolicy)
+{
+    internal::assertSupportedConfiguration();
+
+#ifdef CAROTENE_NEON
+    if (shift >= 16u)
+    {
+        if (cpolicy == CONVERT_POLICY_WRAP)
+        {
+            // Only the sign survives such a shift: non-negative inputs give 0,
+            // negative inputs give 255.
+            size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+            size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+            int16x8_t v_zero = vdupq_n_s16(0);
+
+            for (size_t i = 0; i < size.height; ++i)
+            {
+                const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
+                u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
+                size_t j = 0;
+
+                for (; j < roiw16; j += 16)
+                {
+                    internal::prefetch(src + j);
+                    int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
+                    // "src < 0" produces an all-ones lane, narrowed to 0xFF
+                    uint8x16_t v_dst = vcombine_u8(vmovn_u16(vcltq_s16(v_src0, v_zero)),
+                                                   vmovn_u16(vcltq_s16(v_src1, v_zero)));
+                    vst1q_u8(dst + j, v_dst);
+                }
+                for (; j < roiw8; j += 8)
+                {
+                    int16x8_t v_src = vld1q_s16(src + j);
+                    vst1_u8(dst + j, vmovn_u16(vcltq_s16(v_src, v_zero)));
+                }
+
+                for (; j < size.width; j++)
+                {
+                    dst[j] = src[j] >= 0 ? 0 : 255;
+                }
+            }
+        }
+        else
+        {
+            // any other policy: the destination is filled with zeros
+            for (size_t i = 0; i < size.height; ++i)
+            {
+                u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
+                std::memset(dst, 0, sizeof(u8) * size.width);
+            }
+        }
+        return;
+    }
+
+    // this ugly construction is needed to avoid:
+    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
+    // return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
+
+    // The table never changes, so it is built once instead of on every call;
+    // shift < 16 is guaranteed by the check above.
+    static const rshiftConstFunc funcs[16] =
+    {
+        rshiftConst<0>,
+        rshiftConst<1>,
+        rshiftConst<2>,
+        rshiftConst<3>,
+        rshiftConst<4>,
+        rshiftConst<5>,
+        rshiftConst<6>,
+        rshiftConst<7>,
+        rshiftConst<8>,
+        rshiftConst<9>,
+        rshiftConst<10>,
+        rshiftConst<11>,
+        rshiftConst<12>,
+        rshiftConst<13>,
+        rshiftConst<14>,
+        rshiftConst<15>
+    };
+    const rshiftConstFunc func = funcs[shift];
+
+    func(size, srcBase, srcStride, dstBase, dstStride, cpolicy);
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)shift;
+    (void)cpolicy;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/convert_scale.cpp
+++ b/3rdparty/carotene/src/convert_scale.cpp
--- a/3rdparty/carotene/src/convolution.cpp
+++ b/3rdparty/carotene/src/convolution.cpp
@ -0,0 +1,340 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "saturate_cast.hpp"
+
+namespace CAROTENE_NS {
+
+/*
+ * Tells whether convolution() can handle the given configuration.
+ *
+ * All of the following must hold:
+ *   - isSupportedConfiguration() is true;
+ *   - the image is at least 8 elements wide;
+ *   - the border mode is BORDER_MODE_CONSTANT or BORDER_MODE_REPLICATE;
+ *   - the kernel is 3x3.
+ */
+bool isConvolutionSupported(const Size2D &size, const Size2D &ksize,
+                            BORDER_MODE border)
+{
+    if (!isSupportedConfiguration())
+        return false;
+
+    if (size.width < 8)
+        return false;
+
+    if (border != BORDER_MODE_CONSTANT && border != BORDER_MODE_REPLICATE)
+        return false;
+
+    return ksize.width == 3 && ksize.height == 3;
+}
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+// Shifts the four s32 lanes of `value` right by the compile-time constant
+// `shift`.
+// NOTE(review): presumably a template because vshrq_n_s32 needs an immediate
+// shift count - confirm.
+template <int shift>
+int32x4_t vshrq_s32(int32x4_t value)
+{
+    return vshrq_n_s32(value, shift);
+}
+
+// A zero shift returns the vector unchanged instead of calling vshrq_n_s32.
+template <>
+int32x4_t vshrq_s32<0>(int32x4_t value)
+{
+    return value;
+}
+
+} // namespace
+
+// Pointer type matching every vshrq_s32<N> instantiation, so that the shift
+// amount can be selected at run time.
+typedef int32x4_t (* vshrq_s32_func)(int32x4_t value);
+
+#endif
+
+/*
+ * Applies a 3x3 kernel to a u8 image.
+ *
+ * For every pixel the nine neighbours are multiplied by the kernel
+ * coefficients, the sum is shifted right by `scale` bits and stored saturated
+ * to the u8 range.
+ *
+ * size               - image dimensions in elements
+ * srcBase/srcStride  - source image and its row pitch in bytes
+ * dstBase/dstStride  - destination image and its row pitch in bytes
+ * border             - BORDER_MODE_CONSTANT or BORDER_MODE_REPLICATE
+ * borderValue        - value used outside the image in constant mode
+ * ksize              - kernel size, must be 3x3
+ * kernelBase         - nine coefficients, applied in reverse order:
+ *                      kernelBase[8] weights the top-left neighbour and
+ *                      kernelBase[0] the bottom-right one
+ * scale              - right shift applied to each sum, 0..32
+ *
+ * Aborts through internal::assertSupportedConfiguration() when
+ * isConvolutionSupported() rejects the configuration or when scale exceeds 32
+ * (the shift dispatch table has entries for 0..32 only).
+ */
+void convolution(const Size2D &size,
+                 const u8 * srcBase, ptrdiff_t srcStride,
+                 u8 * dstBase, ptrdiff_t dstStride,
+                 BORDER_MODE border, u8 borderValue,
+                 const Size2D & ksize, s16 * kernelBase, u32 scale)
+{
+    // scale is used as an index into a 33-entry table below, so it has to be
+    // validated together with the other parameters
+    internal::assertSupportedConfiguration(isConvolutionSupported(size, ksize, border) &&
+                                           scale <= 32u);
+#ifdef CAROTENE_NEON
+    const uint8x8_t v_zero_u8 = vdup_n_u8(0);
+    const uint8x8_t v_border = vdup_n_u8(borderValue);
+    const int32x4_t v_zero_s32 = vdupq_n_s32(0);
+
+    // previous / current / next 8-pixel column blocks, one entry per source row
+    uint8x8_t tprev[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
+              tcurr[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
+              tnext[3] = { v_zero_u8, v_zero_u8, v_zero_u8 };
+    uint8x8_t t0 = v_zero_u8, t1 = v_zero_u8, t2 = v_zero_u8;
+
+    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
+    static const vshrq_s32_func vshrq_s32_a[33] =
+    {
+        vshrq_s32<0>,
+        vshrq_s32<1>,
+        vshrq_s32<2>,
+        vshrq_s32<3>,
+        vshrq_s32<4>,
+        vshrq_s32<5>,
+        vshrq_s32<6>,
+        vshrq_s32<7>,
+        vshrq_s32<8>,
+        vshrq_s32<9>,
+        vshrq_s32<10>,
+        vshrq_s32<11>,
+        vshrq_s32<12>,
+        vshrq_s32<13>,
+        vshrq_s32<14>,
+        vshrq_s32<15>,
+        vshrq_s32<16>,
+        vshrq_s32<17>,
+        vshrq_s32<18>,
+        vshrq_s32<19>,
+        vshrq_s32<20>,
+        vshrq_s32<21>,
+        vshrq_s32<22>,
+        vshrq_s32<23>,
+        vshrq_s32<24>,
+        vshrq_s32<25>,
+        vshrq_s32<26>,
+        vshrq_s32<27>,
+        vshrq_s32<28>,
+        vshrq_s32<29>,
+        vshrq_s32<30>,
+        vshrq_s32<31>,
+        vshrq_s32<32>
+    };
+    vshrq_s32_func vshrq_s32_p = vshrq_s32_a[scale];
+
+    for (ptrdiff_t y = 0; y < height; ++y)
+    {
+        // rows above / at / below y; a NULL pointer marks a row that lies
+        // outside the image in constant-border mode
+        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
+        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
+        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
+        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
+
+        // scalar counterparts of tprev/tcurr/tnext used by the tail loop
+        u8 prevx[3] = { 0, 0, 0 },
+           currx[3] = { 0, 0, 0 },
+           nextx[3] = { 0, 0, 0 };
+        ptrdiff_t x = 0;
+        // NOTE(review): the vector loop stops 8 elements earlier on the last
+        // two rows, presumably to keep the 8-byte loads inside the buffer - confirm
+        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
+
+        // perform vertical convolution
+        for ( ; x <= bwidth; x += 8)
+        {
+            internal::prefetch(srow0 + x);
+            internal::prefetch(srow1 + x);
+            internal::prefetch(srow2 + x);
+
+            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
+            uint8x8_t x1 = vld1_u8(srow1 + x);
+            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
+
+            // calculate values for plain CPU part below if needed
+            if (x + 8 >= bwidth)
+            {
+                ptrdiff_t x3 = x == width ? width - 1 : x;
+                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
+
+                if (border == BORDER_MODE_CONSTANT && x4 < 0)
+                    prevx[0] = prevx[1] = prevx[2] = borderValue;
+                else
+                {
+                    prevx[0] = srow0 ? srow0[x4] : borderValue;
+                    prevx[1] =         srow1[x4]              ;
+                    prevx[2] = srow2 ? srow2[x4] : borderValue;
+                }
+
+                currx[0] = srow0 ? srow0[x3] : borderValue;
+                currx[1] =         srow1[x3]              ;
+                currx[2] = srow2 ? srow2[x3] : borderValue;
+            }
+
+            // make shift
+            if (x)
+            {
+                tprev[0] = tcurr[0];
+                tcurr[0] = tnext[0];
+
+                tprev[1] = tcurr[1];
+                tcurr[1] = tnext[1];
+
+                tprev[2] = tcurr[2];
+                tcurr[2] = tnext[2];
+            }
+
+            tnext[0] = x0;
+            tnext[1] = x1;
+            tnext[2] = x2;
+
+            // make extrapolation for the first elements
+            if (!x)
+            {
+                // make border
+                if (border == BORDER_MODE_CONSTANT)
+                    tcurr[0] = tcurr[1] = tcurr[2] = v_border;
+                else if (border == BORDER_MODE_REPLICATE)
+                {
+                    tcurr[0] = vdup_n_u8(vget_lane_u8(tnext[0], 0));
+                    tcurr[1] = vdup_n_u8(vget_lane_u8(tnext[1], 0));
+                    tcurr[2] = vdup_n_u8(vget_lane_u8(tnext[2], 0));
+                }
+
+                // the first block only primes the pipeline; output starts
+                // with the next iteration
+                continue;
+            }
+
+            int32x4_t v_dst0 = v_zero_s32, v_dst1 = v_zero_s32;
+
+            {
+                // combine 3 "shifted" vectors
+                t0 = vext_u8(tprev[0], tcurr[0], 7);
+                t1 = tcurr[0];
+                t2 = vext_u8(tcurr[0], tnext[0], 1);
+
+                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
+                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
+                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));
+
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[8]);
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[7]);
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[6]);
+
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[8]);
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[7]);
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[6]);
+            }
+
+            {
+                // combine 3 "shifted" vectors
+                t0 = vext_u8(tprev[1], tcurr[1], 7);
+                t1 = tcurr[1];
+                t2 = vext_u8(tcurr[1], tnext[1], 1);
+
+                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
+                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
+                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));
+
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[5]);
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[4]);
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[3]);
+
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[5]);
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[4]);
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[3]);
+            }
+
+            {
+                // combine 3 "shifted" vectors
+                t0 = vext_u8(tprev[2], tcurr[2], 7);
+                t1 = tcurr[2];
+                t2 = vext_u8(tcurr[2], tnext[2], 1);
+
+                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
+                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
+                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));
+
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[2]);
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[1]);
+                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[0]);
+
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[2]);
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[1]);
+                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[0]);
+            }
+
+
+            // make scale
+            v_dst0 = vshrq_s32_p(v_dst0);
+            v_dst1 = vshrq_s32_p(v_dst1);
+
+            // and add them
+            vst1_u8(drow + x - 8, vqmovn_u16(vcombine_u16(vqmovun_s32(v_dst0),
+                                                          vqmovun_s32(v_dst1))));
+        }
+
+        // step back to the first column the vector loop did not write
+        x -= 8;
+        if (x == width)
+            --x;
+
+        for ( ; x < width; ++x)
+        {
+            // make extrapolation for the last elements
+            if (x + 1 >= width)
+            {
+                if (border == BORDER_MODE_CONSTANT)
+                {
+                    nextx[0] = borderValue;
+                    nextx[1] = borderValue;
+                    nextx[2] = borderValue;
+                }
+                else if (border == BORDER_MODE_REPLICATE)
+                {
+                    nextx[0] = srow0[x];
+                    nextx[1] = srow1[x];
+                    nextx[2] = srow2[x];
+                }
+            }
+            else
+            {
+                nextx[0] = srow0 ? srow0[x + 1] : borderValue;
+                nextx[1] =         srow1[x + 1]              ;
+                nextx[2] = srow2 ? srow2[x + 1] : borderValue;
+            }
+
+            s32 val = 0;
+            for (s32 _y = 0; _y < 3; ++_y)
+                val += prevx[_y] * kernelBase[(2 - _y) * 3 + 2] +
+                       currx[_y] * kernelBase[(2 - _y) * 3 + 1] +
+                       nextx[_y] * kernelBase[(2 - _y) * 3 + 0];
+
+            // Shifting a 32-bit value by 32 is undefined in C++, so that case
+            // is spelled out: only the sign remains, as in the vector path.
+            const s32 scaled = scale < 32u ? (val >> scale) : (val < 0 ? -1 : 0);
+            drow[x] = internal::saturate_cast<u8>(scaled);
+
+            // make shift
+            prevx[0] = currx[0];
+            currx[0] = nextx[0];
+
+            prevx[1] = currx[1];
+            currx[1] = nextx[1];
+
+            prevx[2] = currx[2];
+            currx[2] = nextx[2];
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)border;
+    (void)borderValue;
+    (void)ksize;
+    (void)kernelBase;
+    (void)scale;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/count_nonzero.cpp
+++ b/3rdparty/carotene/src/count_nonzero.cpp
@ -0,0 +1,430 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include <limits>
+
+namespace CAROTENE_NS {
+
+// Counts the non-zero elements of a u8 image. The count saturates at
+// 0x7fFFffFF; without NEON support the function returns 0.
+s32 countNonZero(const Size2D &_size,
+                 const u8 * srcBase, ptrdiff_t srcStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Work on a copy: when the stride equals the width the whole image is
+    // walked as one long row.
+    Size2D size(_size);
+    if (srcStride == (ptrdiff_t)(size.width))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+
+    // Each u8 lane gains at most 1 per 16-element step, so 255 steps is the
+    // most a lane can take before it has to be flushed into wider storage.
+    #define COUNTNONZERO8U_BLOCK_SIZE (16*255)
+
+    const size_t vecWidth = size.width & ~15u; // part covered by whole 16-byte vectors
+    const uint8x16_t v_one = vmovq_n_u8(1);
+    s32 result = 0;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const u8* src = internal::getRowPtr( srcBase,  srcStride, row);
+        size_t x = 0;
+
+        while (x < vecWidth)
+        {
+            // start offset of the last full vector belonging to this block
+            const size_t last = std::min(x + COUNTNONZERO8U_BLOCK_SIZE, size.width) - 16;
+            uint8x16_t v_acc = vmovq_n_u8(0);
+
+            for (; x <= last; x += 16)
+            {
+                internal::prefetch(src + x);
+                // min(value, 1) turns every non-zero byte into exactly 1
+                const uint8x16_t v_flags = vminq_u8(vld1q_u8(src + x), v_one);
+                v_acc = vaddq_u8(v_acc, v_flags);
+            }
+
+            // widen u8 -> u16 -> u32 while summing neighbours, then fold to two lanes
+            const uint32x4_t v_sum4 = vpaddlq_u16(vpaddlq_u8(v_acc));
+            const uint32x2_t v_sum2 = vadd_u32(vget_low_u32(v_sum4), vget_high_u32(v_sum4));
+
+            s32 part[2];
+            vst1_u32((u32*)part, v_sum2);
+
+            // a negative value means the signed range was exceeded: saturate
+            if (part[0] < 0 || part[1] < 0)
+                return 0x7fFFffFF;
+
+            part[0] += part[1];
+            result += part[0];
+            if (part[0] < 0 || result < 0)
+                return 0x7fFFffFF;
+        }
+
+        // scalar tail for the elements that do not fill a whole vector
+        for (; x < size.width; ++x)
+        {
+            if (src[x] != 0)
+                result += 1;
+        }
+        if (result < 0)
+            return 0x7fFFffFF;
+    }
+    return result;
+#else
+    (void)_size;
+    (void)srcBase;
+    (void)srcStride;
+
+    return 0;
+#endif
+}
+
+// Counts the non-zero elements of a u16 image. The count saturates at
+// 0x7fFFffFF; without NEON support the function returns 0.
+s32 countNonZero(const Size2D &_size,
+                 const u16 * srcBase, ptrdiff_t srcStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    Size2D size(_size);
+    // A densely packed image can be walked as one long row. The row length
+    // has to be converted to bytes before it is compared with the stride;
+    // comparing against the element count never matches a packed u16 image.
+    // NOTE(review): assumes srcStride is a byte stride - confirm against getRowPtr().
+    if (srcStride == (ptrdiff_t)(size.width * sizeof(u16)))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+    // size_t mask: a 32-bit "~7u" would also clear the upper half of a 64-bit width
+    size_t roiw8 = size.width & ~(size_t)7;
+    s32 result = 0;
+    for(size_t k = 0; k < size.height; ++k)
+    {
+        const u16* src = internal::getRowPtr( srcBase,  srcStride, k);
+        size_t i = 0;
+
+        // Each u16 lane gains at most 1 per 8-element step, so 65535 steps is
+        // the most a lane can take before it has to be flushed.
+        #define COUNTNONZERO16U_BLOCK_SIZE (8*(256*256-1))
+        uint16x8_t vc1 = vmovq_n_u16(1);
+        for (; i < roiw8;)
+        {
+            // start offset of the last full vector belonging to this block
+            size_t lim = std::min(i + COUNTNONZERO16U_BLOCK_SIZE, size.width) - 8;
+            uint16x8_t vs = vmovq_n_u16(0);
+
+            for (; i <= lim; i+= 8)
+            {
+                internal::prefetch(src + i);
+                uint16x8_t vln = vld1q_u16(src + i);
+                // min(value, 1) turns every non-zero element into exactly 1
+                uint16x8_t vnz = vminq_u16(vln, vc1);
+                vs = vaddq_u16(vs, vnz);
+            }
+
+            // widen to u32 while summing neighbours, then fold to two lanes
+            uint32x4_t vs4 = vpaddlq_u16(vs);
+            uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
+
+            s32 s[2];
+            vst1_u32((u32*)s, vs2);
+
+            if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 4GB of non-zeros...
+            {
+                return 0x7fFFffFF;
+            }
+            result += (s[0] += s[1]);
+            if (s[0] < 0 || result < 0)
+            {
+                return 0x7fFFffFF;
+            }
+        }
+        // scalar tail for the elements that do not fill a whole vector
+        for (; i < size.width; i++)
+            result += (src[i] != 0)?1:0;
+        if (result < 0)//saturate in case of overflow ~ 4GB of non-zeros...
+        {
+            return 0x7fFFffFF;
+        }
+    }
+    return result;
+#else
+    (void)_size;
+    (void)srcBase;
+    (void)srcStride;
+
+    return 0;
+#endif
+}
+
+// Counts the non-zero elements of an s32 image. The count saturates at
+// 0x7fFFffFF; without NEON support the function returns 0.
+s32 countNonZero(const Size2D &_size,
+                 const s32 * srcBase, ptrdiff_t srcStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    Size2D size(_size);
+    // A densely packed image can be walked as one long row. The row length
+    // has to be converted to bytes before it is compared with the stride;
+    // comparing against the element count never matches a packed s32 image.
+    // NOTE(review): assumes srcStride is a byte stride - confirm against getRowPtr().
+    if (srcStride == (ptrdiff_t)(size.width * sizeof(s32)))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+    // size_t mask: a 32-bit "~3u" would also clear the upper half of a 64-bit width
+    size_t roiw4 = size.width & ~(size_t)3;
+    s32 result = 0;
+    for(size_t k = 0; k < size.height; ++k)
+    {
+        // only "is it zero" matters, so the data is read as unsigned
+        const u32* src = (const u32*)internal::getRowPtr( srcBase,  srcStride, k);
+        // size_t index: a 32-bit counter would wrap around and never reach a
+        // width that needs more than 32 bits
+        size_t i = 0;
+
+        uint32x4_t vc1 = vmovq_n_u32(1);
+        uint32x4_t vs = vmovq_n_u32(0);
+
+        for (; i < roiw4; i += 4 )
+        {
+            internal::prefetch(src + i);
+            uint32x4_t vln = vld1q_u32(src + i);
+            // min(value, 1) turns every non-zero element into exactly 1
+            uint32x4_t vnz = vminq_u32(vln, vc1);
+            // saturating add: an overflowing lane sticks at the maximum
+            vs = vqaddq_u32(vs, vnz);
+        }
+
+        uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
+
+        s32 s[2];
+        vst1_u32((u32*)s, vs2);
+
+        if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 8GB of non-zeros...
+        {
+            return 0x7fFFffFF;
+        }
+        result += (s[0] += s[1]);
+        if (s[0] < 0 || result < 0)
+        {
+            return 0x7fFFffFF;
+        }
+
+        // scalar tail for the elements that do not fill a whole vector
+        for (; i < size.width; i++)
+            result += (src[i] != 0)?1:0;
+        if (result < 0)//saturate in case of overflow ~ 8GB of non-zeros...
+        {
+            return 0x7fFFffFF;
+        }
+    }
+    return result;
+#else
+    (void)_size;
+    (void)srcBase;
+    (void)srcStride;
+
+    return 0;
+#endif
+}
+
+// Counts the non-zero elements of an f32 image. The count saturates at
+// 0x7fFFffFF; without NEON support the function returns 0.
+s32 countNonZero(const Size2D &_size,
+                 const f32 * srcBase, ptrdiff_t srcStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    Size2D size(_size);
+    // A densely packed image can be walked as one long row. The row length
+    // has to be converted to bytes before it is compared with the stride;
+    // comparing against the element count never matches a packed f32 image.
+    // NOTE(review): assumes srcStride is a byte stride - confirm against getRowPtr().
+    if (srcStride == (ptrdiff_t)(size.width * sizeof(f32)))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+    // size_t mask: a 32-bit "~3u" would also clear the upper half of a 64-bit width
+    size_t roiw4 = size.width & ~(size_t)3;
+    s32 result = 0;
+    for(size_t k = 0; k < size.height; ++k)
+    {
+        const f32* src = internal::getRowPtr( srcBase,  srcStride, k);
+        size_t i = 0;
+
+        float32x4_t vc0 = vmovq_n_f32(0);
+        int32x4_t vs = vmovq_n_s32(0);
+
+        for (; i < roiw4; i += 4 )
+        {
+            internal::prefetch(src + i);
+            float32x4_t vln = vld1q_f32(src + i);
+            // inverted "== 0" mask: all ones (-1 as s32) in every non-zero lane
+            int32x4_t vnz = vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(vln, vc0)));
+            // the lanes therefore count downwards; saturating add keeps them pinned
+            vs = vqaddq_s32(vs, vnz);
+        }
+
+        // fold to two lanes and flip the sign to obtain positive counts
+        int32x2_t vs2 = vqneg_s32(vqadd_s32(vget_low_s32(vs), vget_high_s32(vs)));
+
+        int s[2];
+        vst1_s32(s, vs2);
+
+        result += (s[0] += s[1]);
+        if (s[0] < 0 || result < 0)//case of overflow ~ 8GB of non-zeros...
+        {
+            return 0x7fFFffFF;
+        }
+
+        // scalar tail: magnitudes below the smallest normal float count as zero
+        for (; i < size.width; i++)
+            result += (src[i] < std::numeric_limits<float>::min() && src[i] > -std::numeric_limits<float>::min())?0:1;
+
+        if (result < 0)
+        {
+            return 0x7fFFffFF;
+        }
+    }
+    return result;
+#else
+    (void)_size;
+    (void)srcBase;
+    (void)srcStride;
+
+    return 0;
+#endif
+}
+
+// Counts the non-zero elements of an f64 image. Both +0.0 and -0.0 count as
+// zero, every other bit pattern (denormals included) counts as non-zero.
+// The count saturates at 0x7fFFffFF; without NEON support the function
+// returns 0.
+s32 countNonZero(const Size2D &_size,
+                 const f64 * srcBase, ptrdiff_t srcStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    Size2D size(_size);
+    // A densely packed image can be walked as one long row. The row length
+    // has to be converted to bytes before it is compared with the stride;
+    // comparing against the element count never matches a packed f64 image.
+    // NOTE(review): assumes srcStride is a byte stride - confirm against getRowPtr().
+    if (srcStride == (ptrdiff_t)(size.width * sizeof(f64)))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+    // size_t masks: 32-bit "~7u" style masks would also clear the upper half
+    // of a 64-bit width
+    size_t roiw8 = size.width & ~(size_t)7;
+    size_t roiw4 = size.width & ~(size_t)3;
+    size_t roiw2 = size.width & ~(size_t)1;
+    uint64x2_t vmask1 = vdupq_n_u64(0x7fFFffFFffFFffFFULL); //will treat denormals as non-zero
+    uint32x4_t vc0 = vmovq_n_u32(0);
+
+    s32 result = 0;
+    for(size_t k = 0; k < size.height; ++k)
+    {
+        const f64* src = internal::getRowPtr( srcBase,  srcStride, k);
+        size_t i = 0;
+
+        // four independent accumulators; every lane counts downwards because
+        // a non-zero element contributes an all-ones mask (-1 as s32)
+        int32x2_t vs1 = vmov_n_s32(0);
+        int32x2_t vs2 = vmov_n_s32(0);
+        int32x2_t vs3 = vmov_n_s32(0);
+        int32x2_t vs4 = vmov_n_s32(0);
+
+        // main loop: 8 doubles per iteration
+        for (; i < roiw8; i += 8 )
+        {
+            internal::prefetch(src + i + 6);
+            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
+            uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
+            uint64x2_t vln3 = vld1q_u64((const u64*)(src + i + 4));
+            uint64x2_t vln4 = vld1q_u64((const u64*)(src + i + 6));
+
+            // drop the sign bit so that -0.0 compares equal to zero
+            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
+            uint64x2_t vm2 = vandq_u64(vln2, vmask1);
+            uint64x2_t vm3 = vandq_u64(vln3, vmask1);
+            uint64x2_t vm4 = vandq_u64(vln4, vmask1);
+
+            // compare both 32-bit halves of every double with zero ...
+            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
+            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
+            uint32x4_t vequ3 = vceqq_u32(vreinterpretq_u32_u64(vm3), vc0);
+            uint32x4_t vequ4 = vceqq_u32(vreinterpretq_u32_u64(vm4), vc0);
+
+            // ... and invert: all ones where a half is non-zero
+            uint32x4_t vlx1 = vmvnq_u32(vequ1);
+            uint32x4_t vlx2 = vmvnq_u32(vequ2);
+            uint32x4_t vlx3 = vmvnq_u32(vequ3);
+            uint32x4_t vlx4 = vmvnq_u32(vequ4);
+
+            // pairwise max merges the two halves: one lane per double
+            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
+            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
+            int32x2_t vnz3 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx3), vget_high_u32(vlx3)));
+            int32x2_t vnz4 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx4), vget_high_u32(vlx4)));
+
+            vs1 = vqadd_s32(vs1, vnz1);
+            vs2 = vqadd_s32(vs2, vnz2);
+            vs3 = vqadd_s32(vs3, vnz3);
+            vs4 = vqadd_s32(vs4, vnz4);
+        }
+
+        // at most one more group of 4 doubles
+        if (i < roiw4)
+        {
+            internal::prefetch(src + i + 2);
+            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
+            uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
+
+            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
+            uint64x2_t vm2 = vandq_u64(vln2, vmask1);
+
+            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
+            uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
+
+            uint32x4_t vlx1 = vmvnq_u32(vequ1);
+            uint32x4_t vlx2 = vmvnq_u32(vequ2);
+
+            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
+            int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
+
+            vs1 = vqadd_s32(vs1, vnz1);
+            vs2 = vqadd_s32(vs2, vnz2);
+            i += 4;
+        }
+
+        // at most one more pair of doubles
+        if (i < roiw2)
+        {
+            internal::prefetch(src + i);
+            uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
+
+            uint64x2_t vm1 = vandq_u64(vln1, vmask1);
+
+            uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
+
+            uint32x4_t vlx1 = vmvnq_u32(vequ1);
+
+            int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
+
+            vs1 = vqadd_s32(vs1, vnz1);
+            i += 2;
+        }
+
+        // merge the accumulators and flip the sign to obtain positive counts
+        vs1 = vqadd_s32(vs1, vs2);
+        vs3 = vqadd_s32(vs3, vs4);
+        vs1 = vqadd_s32(vs1, vs3);
+        int32x2_t vsneg = vqneg_s32(vs1);
+
+        s32 s[2];
+        vst1_s32(s, vsneg);
+
+        result += (s[0] += s[1]);
+        if (s[0] < 0 || result < 0)//case of overflow ~ 16GB of non-zeros...
+        {
+            return 0x7fFFffFF;
+        }
+
+        // Scalar tail (at most one element). It has to classify values the
+        // same way as the bit test above, otherwise a denormal would be
+        // counted or skipped depending on its position in the row.
+        for (; i < size.width; i++)
+            result += (src[i] != 0.0)?1:0;
+        if (result < 0)
+        {
+            return 0x7fFFffFF;
+        }
+    }
+    return result;
+#else
+    (void)_size;
+    (void)srcBase;
+    (void)srcStride;
+
+    return 0;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/div.cpp
+++ b/3rdparty/carotene/src/div.cpp
@ -0,0 +1,694 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+#include <cstring>
+#include <cfloat>
+#include <cmath>
+#include <limits>
+
+namespace CAROTENE_NS {
+
+namespace {
+
+#ifdef CAROTENE_NEON
+
+// Element-wise v1 * scale / v2 on a 128-bit vector, narrowing with saturation.
+// The generic version splits the vector into halves, widens each half and
+// recurses until the 32-bit specializations below do the arithmetic in float.
+template <typename T>
+inline T divSaturateQ(const T &v1, const T &v2, const float scale)
+{
+    return internal::vcombine(
+        // lower half
+        internal::vqmovn(
+            divSaturateQ(internal::vmovl(internal::vget_low(v1)),
+                         internal::vmovl(internal::vget_low(v2)),
+                         scale)),
+        // upper half
+        internal::vqmovn(
+            divSaturateQ(internal::vmovl(internal::vget_high(v1)),
+                         internal::vmovl(internal::vget_high(v2)),
+                         scale)));
+}
+
+// s32 lanes: scale the numerator, multiply by the reciprocal of the denominator
+template <>
+inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
+{
+    const float32x4_t v_num = vmulq_n_f32(vcvtq_f32_s32(v1), scale);
+    const float32x4_t v_rcp = internal::vrecpq_f32(vcvtq_f32_s32(v2));
+    return vcvtq_s32_f32(vmulq_f32(v_num, v_rcp));
+}
+
+// u32 lanes: same computation with the unsigned conversions
+template <>
+inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
+{
+    const float32x4_t v_num = vmulq_n_f32(vcvtq_f32_u32(v1), scale);
+    const float32x4_t v_rcp = internal::vrecpq_f32(vcvtq_f32_u32(v2));
+    return vcvtq_u32_f32(vmulq_f32(v_num, v_rcp));
+}
+
+// Element-wise v1 * scale / v2 on a 64-bit vector, narrowing with saturation.
+// The generic version widens to 128 bits and reuses divSaturateQ; the 32-bit
+// specializations below do the arithmetic directly in float.
+template <typename T>
+inline T divSaturate(const T &v1, const T &v2, const float scale)
+{
+    return internal::vqmovn(
+        divSaturateQ(internal::vmovl(v1),
+                     internal::vmovl(v2),
+                     scale));
+}
+
+// s32 lanes
+template <>
+inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
+{
+    const float32x2_t v_num = vmul_n_f32(vcvt_f32_s32(v1), scale);
+    const float32x2_t v_rcp = internal::vrecp_f32(vcvt_f32_s32(v2));
+    return vcvt_s32_f32(vmul_f32(v_num, v_rcp));
+}
+
+// u32 lanes
+template <>
+inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
+{
+    const float32x2_t v_num = vmul_n_f32(vcvt_f32_u32(v1), scale);
+    const float32x2_t v_rcp = internal::vrecp_f32(vcvt_f32_u32(v2));
+    return vcvt_u32_f32(vmul_f32(v_num, v_rcp));
+}
+
+
+// Element-wise v1 * scale / v2 on a 128-bit vector, narrowing by plain
+// truncation of the upper bits (vmovn). The generic version splits the vector
+// into halves, widens each half and recurses until the 32-bit specializations
+// below do the arithmetic in float.
+template <typename T>
+inline T divWrapQ(const T &v1, const T &v2, const float scale)
+{
+    return internal::vcombine(
+        // lower half
+        internal::vmovn(
+            divWrapQ(internal::vmovl(internal::vget_low(v1)),
+                     internal::vmovl(internal::vget_low(v2)),
+                     scale)),
+        // upper half
+        internal::vmovn(
+            divWrapQ(internal::vmovl(internal::vget_high(v1)),
+                     internal::vmovl(internal::vget_high(v2)),
+                     scale)));
+}
+
+// s32 lanes: scale the numerator, multiply by the reciprocal of the denominator
+template <>
+inline int32x4_t divWrapQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
+{
+    const float32x4_t v_num = vmulq_n_f32(vcvtq_f32_s32(v1), scale);
+    const float32x4_t v_rcp = internal::vrecpq_f32(vcvtq_f32_s32(v2));
+    return vcvtq_s32_f32(vmulq_f32(v_num, v_rcp));
+}
+
+// u32 lanes: same computation with the unsigned conversions
+template <>
+inline uint32x4_t divWrapQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
+{
+    const float32x4_t v_num = vmulq_n_f32(vcvtq_f32_u32(v1), scale);
+    const float32x4_t v_rcp = internal::vrecpq_f32(vcvtq_f32_u32(v2));
+    return vcvtq_u32_f32(vmulq_f32(v_num, v_rcp));
+}
+
+// Element-wise v1 * scale / v2 on a 64-bit vector, narrowing by plain
+// truncation of the upper bits (vmovn). The generic version widens to 128 bits
+// and reuses divWrapQ; the 32-bit specializations below do the arithmetic
+// directly in float.
+template <typename T>
+inline T divWrap(const T &v1, const T &v2, const float scale)
+{
+    return internal::vmovn(
+        divWrapQ(internal::vmovl(v1),
+                 internal::vmovl(v2),
+                 scale));
+}
+
+// s32 lanes
+template <>
+inline int32x2_t divWrap<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
+{
+    const float32x2_t v_num = vmul_n_f32(vcvt_f32_s32(v1), scale);
+    const float32x2_t v_rcp = internal::vrecp_f32(vcvt_f32_s32(v2));
+    return vcvt_s32_f32(vmul_f32(v_num, v_rcp));
+}
+
+// u32 lanes
+template <>
+inline uint32x2_t divWrap<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
+{
+    const float32x2_t v_num = vmul_n_f32(vcvt_f32_u32(v1), scale);
+    const float32x2_t v_rcp = internal::vrecp_f32(vcvt_f32_u32(v2));
+    return vcvt_u32_f32(vmul_f32(v_num, v_rcp));
+}
+
+// Type-overloaded wrappers around the NEON "test bits" intrinsics so that the
+// templates in this file can call vtstq()/vtst() for any element type. The
+// intrinsics return unsigned masks; the signed overloads reinterpret the mask
+// back to the signed vector type of their arguments.
+
+// 128-bit vectors, unsigned elements
+inline uint8x16_t vtstq(const uint8x16_t &v0, const uint8x16_t &v1)
+{
+    return vtstq_u8(v0, v1);
+}
+inline uint16x8_t vtstq(const uint16x8_t &v0, const uint16x8_t &v1)
+{
+    return vtstq_u16(v0, v1);
+}
+inline uint32x4_t vtstq(const uint32x4_t &v0, const uint32x4_t &v1)
+{
+    return vtstq_u32(v0, v1);
+}
+
+// 128-bit vectors, signed elements
+inline int8x16_t vtstq(const int8x16_t &v0, const int8x16_t &v1)
+{
+    const uint8x16_t v_mask = vtstq_s8(v0, v1);
+    return vreinterpretq_s8_u8(v_mask);
+}
+inline int16x8_t vtstq(const int16x8_t &v0, const int16x8_t &v1)
+{
+    const uint16x8_t v_mask = vtstq_s16(v0, v1);
+    return vreinterpretq_s16_u16(v_mask);
+}
+inline int32x4_t vtstq(const int32x4_t &v0, const int32x4_t &v1)
+{
+    const uint32x4_t v_mask = vtstq_s32(v0, v1);
+    return vreinterpretq_s32_u32(v_mask);
+}
+
+// 64-bit vectors, unsigned elements
+inline uint8x8_t vtst(const uint8x8_t &v0, const uint8x8_t &v1)
+{
+    return vtst_u8(v0, v1);
+}
+inline uint16x4_t vtst(const uint16x4_t &v0, const uint16x4_t &v1)
+{
+    return vtst_u16(v0, v1);
+}
+inline uint32x2_t vtst(const uint32x2_t &v0, const uint32x2_t &v1)
+{
+    return vtst_u32(v0, v1);
+}
+
+// 64-bit vectors, signed elements
+inline int8x8_t vtst(const int8x8_t &v0, const int8x8_t &v1)
+{
+    const uint8x8_t v_mask = vtst_s8(v0, v1);
+    return vreinterpret_s8_u8(v_mask);
+}
+inline int16x4_t vtst(const int16x4_t &v0, const int16x4_t &v1)
+{
+    const uint16x4_t v_mask = vtst_s16(v0, v1);
+    return vreinterpret_s16_u16(v_mask);
+}
+inline int32x2_t vtst(const int32x2_t &v0, const int32x2_t &v1)
+{
+    const uint32x2_t v_mask = vtst_s32(v0, v1);
+    return vreinterpret_s32_u32(v_mask);
+}
+#endif
+
+// Element-wise dst = scale * src0 / src1 for the element types instantiated
+// further down in this file. Elements whose divisor is zero produce 0.
+// cpolicy selects saturating narrowing (CONVERT_POLICY_SATURATE) or wrapping
+// narrowing (any other value). Without NEON support the function does nothing.
+template <typename T>
+void div(const Size2D &size,
+         const T * src0Base, ptrdiff_t src0Stride,
+         const T * src1Base, ptrdiff_t src1Stride,
+         T * dstBase, ptrdiff_t dstStride,
+         f32 scale,
+         CONVERT_POLICY cpolicy)
+{
+    internal::assertSupportedConfiguration();
+
+#ifdef CAROTENE_NEON
+    typedef typename internal::VecTraits<T>::vec128 vec128;
+    typedef typename internal::VecTraits<T>::vec64 vec64;
+
+    // Shortcut: the destination is simply cleared when the scale is zero, or
+    // when T is an integer type and even the largest value of T scaled stays
+    // strictly between -1 and 1.
+    const bool zeroScale = (scale == 0.0f);
+    const bool tinyScale = std::numeric_limits<T>::is_integer &&
+                           (scale * std::numeric_limits<T>::max()) <  1.0f &&
+                           (scale * std::numeric_limits<T>::max()) > -1.0f;
+    if (zeroScale || tinyScale)
+    {
+        for (size_t row = 0; row < size.height; ++row)
+        {
+            T * dstRow = internal::getRowPtr(dstBase, dstStride, row);
+            std::memset(dstRow, 0, sizeof(T) * size.width);
+        }
+        return;
+    }
+
+    // elements per 128-bit / 64-bit vector and the first column from which a
+    // full vector no longer fits into the row
+    const size_t step128 = 16 / sizeof(T);
+    const size_t step64 = 8 / sizeof(T);
+    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
+    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
+
+    const bool saturate = (cpolicy == CONVERT_POLICY_SATURATE);
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const T * src0 = internal::getRowPtr(src0Base, src0Stride, row);
+        const T * src1 = internal::getRowPtr(src1Base, src1Stride, row);
+        T * dst = internal::getRowPtr(dstBase, dstStride, row);
+        size_t col = 0;
+
+        if (saturate)
+        {
+            // 128-bit vectors
+            for (; col < roiw128; col += step128)
+            {
+                internal::prefetch(src0 + col);
+                internal::prefetch(src1 + col);
+
+                const vec128 v_num = internal::vld1q(src0 + col);
+                const vec128 v_den = internal::vld1q(src1 + col);
+
+                // all ones where the divisor is non-zero; clears the other lanes
+                const vec128 v_keep = vtstq(v_den, v_den);
+                const vec128 v_quot = divSaturateQ(v_num, v_den, scale);
+                internal::vst1q(dst + col, internal::vandq(v_keep, v_quot));
+            }
+            // 64-bit vectors
+            for (; col < roiw64; col += step64)
+            {
+                const vec64 v_num = internal::vld1(src0 + col);
+                const vec64 v_den = internal::vld1(src1 + col);
+
+                const vec64 v_keep = vtst(v_den, v_den);
+                const vec64 v_quot = divSaturate(v_num, v_den, scale);
+                internal::vst1(dst + col, internal::vand(v_keep, v_quot));
+            }
+            // scalar tail
+            for (; col < size.width; col++)
+            {
+                if (src1[col])
+                    dst[col] = internal::saturate_cast<T>(scale * src0[col] / src1[col]);
+                else
+                    dst[col] = 0;
+            }
+        }
+        else // CONVERT_POLICY_WRAP
+        {
+            // 128-bit vectors
+            for (; col < roiw128; col += step128)
+            {
+                internal::prefetch(src0 + col);
+                internal::prefetch(src1 + col);
+
+                const vec128 v_num = internal::vld1q(src0 + col);
+                const vec128 v_den = internal::vld1q(src1 + col);
+
+                // all ones where the divisor is non-zero; clears the other lanes
+                const vec128 v_keep = vtstq(v_den, v_den);
+                const vec128 v_quot = divWrapQ(v_num, v_den, scale);
+                internal::vst1q(dst + col, internal::vandq(v_keep, v_quot));
+            }
+            // 64-bit vectors
+            for (; col < roiw64; col += step64)
+            {
+                const vec64 v_num = internal::vld1(src0 + col);
+                const vec64 v_den = internal::vld1(src1 + col);
+
+                const vec64 v_keep = vtst(v_den, v_den);
+                const vec64 v_quot = divWrap(v_num, v_den, scale);
+                internal::vst1(dst + col, internal::vand(v_keep, v_quot));
+            }
+            // scalar tail
+            for (; col < size.width; col++)
+            {
+                if (src1[col])
+                    dst[col] = (T)((s32)trunc(scale * src0[col] / src1[col]));
+                else
+                    dst[col] = 0;
+            }
+        }
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)cpolicy;
+    (void)scale;
+#endif
+}
+
+#ifdef CAROTENE_NEON
+
+// Element-wise scale / v2 on a 128-bit vector, narrowing with saturation.
+// The generic version splits the vector into halves, widens each half and
+// recurses until the 32-bit specializations below do the arithmetic in float.
+template <typename T>
+inline T recipSaturateQ(const T &v2, const float scale)
+{
+    return internal::vcombine(
+        // lower half
+        internal::vqmovn(
+            recipSaturateQ(internal::vmovl(internal::vget_low(v2)), scale)),
+        // upper half
+        internal::vqmovn(
+            recipSaturateQ(internal::vmovl(internal::vget_high(v2)), scale)));
+}
+
+// s32 lanes: reciprocal of the converted input, multiplied by the scale
+template <>
+inline int32x4_t recipSaturateQ<int32x4_t>(const int32x4_t &v2, const float scale)
+{
+    const float32x4_t v_rcp = internal::vrecpq_f32(vcvtq_f32_s32(v2));
+    return vcvtq_s32_f32(vmulq_n_f32(v_rcp, scale));
+}
+
+// u32 lanes: same computation with the unsigned conversions
+template <>
+inline uint32x4_t recipSaturateQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
+{
+    const float32x4_t v_rcp = internal::vrecpq_f32(vcvtq_f32_u32(v2));
+    return vcvtq_u32_f32(vmulq_n_f32(v_rcp, scale));
+}
+
+// Element-wise scale / v2 on a 64-bit vector, narrowing with saturation.
+// The generic version widens to 128 bits and reuses recipSaturateQ; the
+// 32-bit specializations below do the arithmetic directly in float.
+template <typename T>
+inline T recipSaturate(const T &v2, const float scale)
+{
+    return internal::vqmovn(
+        recipSaturateQ(internal::vmovl(v2), scale));
+}
+
+// s32 lanes
+template <>
+inline int32x2_t recipSaturate<int32x2_t>(const int32x2_t &v2, const float scale)
+{
+    const float32x2_t v_rcp = internal::vrecp_f32(vcvt_f32_s32(v2));
+    return vcvt_s32_f32(vmul_n_f32(v_rcp, scale));
+}
+
+// u32 lanes
+template <>
+inline uint32x2_t recipSaturate<uint32x2_t>(const uint32x2_t &v2, const float scale)
+{
+    const float32x2_t v_rcp = internal::vrecp_f32(vcvt_f32_u32(v2));
+    return vcvt_u32_f32(vmul_n_f32(v_rcp, scale));
+}
+
+
+// Element-wise scale / v2 on a 128-bit vector, narrowing by plain truncation
+// of the upper bits (vmovn). The generic version splits the vector into
+// halves, widens each half and recurses until the 32-bit specializations
+// below do the arithmetic in float.
+template <typename T>
+inline T recipWrapQ(const T &v2, const float scale)
+{
+    return internal::vcombine(
+        // lower half
+        internal::vmovn(
+            recipWrapQ(internal::vmovl(internal::vget_low(v2)), scale)),
+        // upper half
+        internal::vmovn(
+            recipWrapQ(internal::vmovl(internal::vget_high(v2)), scale)));
+}
+
+// s32 lanes: reciprocal of the converted input, multiplied by the scale
+template <>
+inline int32x4_t recipWrapQ<int32x4_t>(const int32x4_t &v2, const float scale)
+{
+    const float32x4_t v_rcp = internal::vrecpq_f32(vcvtq_f32_s32(v2));
+    return vcvtq_s32_f32(vmulq_n_f32(v_rcp, scale));
+}
+
+// u32 lanes: same computation with the unsigned conversions
+template <>
+inline uint32x4_t recipWrapQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
+{
+    const float32x4_t v_rcp = internal::vrecpq_f32(vcvtq_f32_u32(v2));
+    return vcvtq_u32_f32(vmulq_n_f32(v_rcp, scale));
+}
+
+// Element-wise scale / v2 on a 64-bit vector, narrowing by plain truncation
+// of the upper bits (vmovn). The generic version widens to 128 bits and
+// reuses recipWrapQ; the 32-bit specializations below do the arithmetic
+// directly in float.
+template <typename T>
+inline T recipWrap(const T &v2, const float scale)
+{
+    return internal::vmovn(
+        recipWrapQ(internal::vmovl(v2), scale));
+}
+
+// s32 lanes
+template <>
+inline int32x2_t recipWrap<int32x2_t>(const int32x2_t &v2, const float scale)
+{
+    const float32x2_t v_rcp = internal::vrecp_f32(vcvt_f32_s32(v2));
+    return vcvt_s32_f32(vmul_n_f32(v_rcp, scale));
+}
+
+// u32 lanes
+template <>
+inline uint32x2_t recipWrap<uint32x2_t>(const uint32x2_t &v2, const float scale)
+{
+    const float32x2_t v_rcp = internal::vrecp_f32(vcvt_f32_u32(v2));
+    return vcvt_u32_f32(vmul_n_f32(v_rcp, scale));
+}
+#endif
+
+// Element-wise dst = scale / src1. Elements whose input is zero produce 0.
+// cpolicy selects saturating narrowing (CONVERT_POLICY_SATURATE) or wrapping
+// narrowing (any other value). Without NEON support the function does nothing.
+template <typename T>
+void recip(const Size2D &size,
+           const T * src1Base, ptrdiff_t src1Stride,
+           T * dstBase, ptrdiff_t dstStride,
+           f32 scale,
+           CONVERT_POLICY cpolicy)
+{
+    internal::assertSupportedConfiguration();
+
+#ifdef CAROTENE_NEON
+    typedef typename internal::VecTraits<T>::vec128 vec128;
+    typedef typename internal::VecTraits<T>::vec64 vec64;
+
+    // Shortcut: the destination is simply cleared when the scale is zero, or
+    // when T is an integer type and the scale lies strictly between -1 and 1.
+    const bool zeroScale = (scale == 0.0f);
+    const bool tinyScale = std::numeric_limits<T>::is_integer &&
+                           scale <  1.0f &&
+                           scale > -1.0f;
+    if (zeroScale || tinyScale)
+    {
+        for (size_t row = 0; row < size.height; ++row)
+        {
+            T * dstRow = internal::getRowPtr(dstBase, dstStride, row);
+            std::memset(dstRow, 0, sizeof(T) * size.width);
+        }
+        return;
+    }
+
+    // elements per 128-bit / 64-bit vector and the first column from which a
+    // full vector no longer fits into the row
+    const size_t step128 = 16 / sizeof(T);
+    const size_t step64 = 8 / sizeof(T);
+    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
+    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
+
+    const bool saturate = (cpolicy == CONVERT_POLICY_SATURATE);
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const T * src1 = internal::getRowPtr(src1Base, src1Stride, row);
+        T * dst = internal::getRowPtr(dstBase, dstStride, row);
+        size_t col = 0;
+
+        if (saturate)
+        {
+            // 128-bit vectors
+            for (; col < roiw128; col += step128)
+            {
+                internal::prefetch(src1 + col);
+
+                const vec128 v_den = internal::vld1q(src1 + col);
+
+                // all ones where the input is non-zero; clears the other lanes
+                const vec128 v_keep = vtstq(v_den, v_den);
+                const vec128 v_inv = recipSaturateQ(v_den, scale);
+                internal::vst1q(dst + col, internal::vandq(v_keep, v_inv));
+            }
+            // 64-bit vectors
+            for (; col < roiw64; col += step64)
+            {
+                const vec64 v_den = internal::vld1(src1 + col);
+
+                const vec64 v_keep = vtst(v_den, v_den);
+                const vec64 v_inv = recipSaturate(v_den, scale);
+                internal::vst1(dst + col, internal::vand(v_keep, v_inv));
+            }
+            // scalar tail
+            for (; col < size.width; col++)
+            {
+                if (src1[col])
+                    dst[col] = internal::saturate_cast<T>(scale / src1[col]);
+                else
+                    dst[col] = 0;
+            }
+        }
+        else // CONVERT_POLICY_WRAP
+        {
+            // 128-bit vectors
+            for (; col < roiw128; col += step128)
+            {
+                internal::prefetch(src1 + col);
+
+                const vec128 v_den = internal::vld1q(src1 + col);
+
+                // all ones where the input is non-zero; clears the other lanes
+                const vec128 v_keep = vtstq(v_den, v_den);
+                const vec128 v_inv = recipWrapQ(v_den, scale);
+                internal::vst1q(dst + col, internal::vandq(v_keep, v_inv));
+            }
+            // 64-bit vectors
+            for (; col < roiw64; col += step64)
+            {
+                const vec64 v_den = internal::vld1(src1 + col);
+
+                const vec64 v_keep = vtst(v_den, v_den);
+                const vec64 v_inv = recipWrap(v_den, scale);
+                internal::vst1(dst + col, internal::vand(v_keep, v_inv));
+            }
+            // scalar tail
+            for (; col < size.width; col++)
+            {
+                if (src1[col])
+                    dst[col] = (T)((s32)trunc(scale / src1[col]));
+                else
+                    dst[col] = 0;
+            }
+        }
+    }
+#else
+    (void)size;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)cpolicy;
+    (void)scale;
+#endif
+}
+
+}
+
+// Public u8 entry point: forwards every argument to the div<T> template above.
+void div(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         u8 * dstBase, ptrdiff_t dstStride,
+         f32 scale,
+         CONVERT_POLICY cpolicy)
+{
+    div<u8>(size,
+            src0Base, src0Stride,
+            src1Base, src1Stride,
+            dstBase, dstStride,
+            scale, cpolicy);
+}
+
+// Public s8 entry point: forwards every argument to the div<T> template above.
+void div(const Size2D &size,
+         const s8 * src0Base, ptrdiff_t src0Stride,
+         const s8 * src1Base, ptrdiff_t src1Stride,
+         s8 * dstBase, ptrdiff_t dstStride,
+         f32 scale,
+         CONVERT_POLICY cpolicy)
+{
+    div<s8>(size,
+            src0Base, src0Stride,
+            src1Base, src1Stride,
+            dstBase, dstStride,
+            scale, cpolicy);
+}
+
+// Public u16 entry point: forwards every argument to the div<T> template above.
+void div(const Size2D &size,
+         const u16 * src0Base, ptrdiff_t src0Stride,
+         const u16 * src1Base, ptrdiff_t src1Stride,
+         u16 * dstBase, ptrdiff_t dstStride,
+         f32 scale,
+         CONVERT_POLICY cpolicy)
+{
+    div<u16>(size,
+             src0Base, src0Stride,
+             src1Base, src1Stride,
+             dstBase, dstStride,
+             scale, cpolicy);
+}
+
+// Public s16 entry point: forwards every argument to the div<T> template above.
+void div(const Size2D &size,
+         const s16 * src0Base, ptrdiff_t src0Stride,
+         const s16 * src1Base, ptrdiff_t src1Stride,
+         s16 * dstBase, ptrdiff_t dstStride,
+         f32 scale,
+         CONVERT_POLICY cpolicy)
+{
+    div<s16>(size,
+             src0Base, src0Stride,
+             src1Base, src1Stride,
+             dstBase, dstStride,
+             scale, cpolicy);
+}
+
+// Public s32 entry point: forwards every argument to the div<T> template above.
+void div(const Size2D &size,
+         const s32 * src0Base, ptrdiff_t src0Stride,
+         const s32 * src1Base, ptrdiff_t src1Stride,
+         s32 * dstBase, ptrdiff_t dstStride,
+         f32 scale,
+         CONVERT_POLICY cpolicy)
+{
+    div<s32>(size,
+             src0Base, src0Stride,
+             src1Base, src1Stride,
+             dstBase, dstStride,
+             scale, cpolicy);
+}
+
+// f32 overload of div(): per element, dst = src0 * scale / src1, with 0 stored
+// wherever the divisor compares equal to zero.
+//
+// Rows are located with internal::getRowPtr(base, stride, row). Three cases
+// are handled separately:
+//   * scale == 0                     - each destination row is zero-filled;
+//   * |scale - 1| < FLT_EPSILON      - the multiplication by scale is skipped;
+//   * any other scale                - src0 is multiplied by scale first.
+// The NEON loops multiply by internal::vrecpq_f32 / internal::vrecp_f32 of the
+// divisor; the scalar tail performs a real division.
+// Without CAROTENE_NEON only assertSupportedConfiguration() is executed.
+void div(const Size2D &size,
+         const f32 * src0Base, ptrdiff_t src0Stride,
+         const f32 * src1Base, ptrdiff_t src1Stride,
+         f32 * dstBase, ptrdiff_t dstStride,
+         f32 scale)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (scale == 0.0f)
+    {
+        // A zero scale gives an all-zero destination; no arithmetic needed.
+        for (size_t row = 0; row < size.height; ++row)
+            std::memset(internal::getRowPtr(dstBase, dstStride, row), 0, sizeof(f32) * size.width);
+        return;
+    }
+
+    float32x4_t v_zero = vdupq_n_f32(0.0f);
+    float32x2_t v_zero_half = vget_low_f32(v_zero);
+
+    // Exclusive upper bounds of the 4-lane and the 2-lane loops.
+    const size_t end4 = size.width >= 3 ? size.width - 3 : 0;
+    const size_t end2 = size.width >= 1 ? size.width - 1 : 0;
+
+    const bool unitScale = std::fabs(scale - 1.0f) < FLT_EPSILON;
+
+    if (unitScale)
+    {
+        for (size_t row = 0; row < size.height; ++row)
+        {
+            const f32 * num = internal::getRowPtr(src0Base, src0Stride, row);
+            const f32 * den = internal::getRowPtr(src1Base, src1Stride, row);
+            f32 * out = internal::getRowPtr(dstBase, dstStride, row);
+            size_t x = 0;
+
+            while (x < end4)
+            {
+                internal::prefetch(num + x);
+                internal::prefetch(den + x);
+
+                float32x4_t v_num = vld1q_f32(num + x);
+                float32x4_t v_den = vld1q_f32(den + x);
+                uint32x4_t v_denIsZero = vceqq_f32(v_den, v_zero);
+                float32x4_t v_quot = vmulq_f32(v_num, internal::vrecpq_f32(v_den));
+
+                // Clear every bit of the lanes whose divisor was zero.
+                uint32x4_t v_bits = vbicq_u32(vreinterpretq_u32_f32(v_quot), v_denIsZero);
+                vst1q_f32(out + x, vreinterpretq_f32_u32(v_bits));
+                x += 4;
+            }
+
+            while (x < end2)
+            {
+                float32x2_t v_num = vld1_f32(num + x);
+                float32x2_t v_den = vld1_f32(den + x);
+                uint32x2_t v_denIsZero = vceq_f32(v_den, v_zero_half);
+                float32x2_t v_quot = vmul_f32(v_num, internal::vrecp_f32(v_den));
+
+                uint32x2_t v_bits = vbic_u32(vreinterpret_u32_f32(v_quot), v_denIsZero);
+                vst1_f32(out + x, vreinterpret_f32_u32(v_bits));
+                x += 2;
+            }
+
+            for (; x < size.width; ++x)
+            {
+                if (den[x] != 0.0f)
+                    out[x] = num[x] / den[x];
+                else
+                    out[x] = 0.0f;
+            }
+        }
+    }
+    else
+    {
+        for (size_t row = 0; row < size.height; ++row)
+        {
+            const f32 * num = internal::getRowPtr(src0Base, src0Stride, row);
+            const f32 * den = internal::getRowPtr(src1Base, src1Stride, row);
+            f32 * out = internal::getRowPtr(dstBase, dstStride, row);
+            size_t x = 0;
+
+            while (x < end4)
+            {
+                internal::prefetch(num + x);
+                internal::prefetch(den + x);
+
+                float32x4_t v_num = vld1q_f32(num + x);
+                float32x4_t v_den = vld1q_f32(den + x);
+                uint32x4_t v_denIsZero = vceqq_f32(v_den, v_zero);
+                float32x4_t v_quot = vmulq_f32(vmulq_n_f32(v_num, scale),
+                                               internal::vrecpq_f32(v_den));
+
+                uint32x4_t v_bits = vbicq_u32(vreinterpretq_u32_f32(v_quot), v_denIsZero);
+                vst1q_f32(out + x, vreinterpretq_f32_u32(v_bits));
+                x += 4;
+            }
+
+            while (x < end2)
+            {
+                float32x2_t v_num = vld1_f32(num + x);
+                float32x2_t v_den = vld1_f32(den + x);
+                uint32x2_t v_denIsZero = vceq_f32(v_den, v_zero_half);
+                float32x2_t v_quot = vmul_f32(vmul_n_f32(v_num, scale),
+                                              internal::vrecp_f32(v_den));
+
+                uint32x2_t v_bits = vbic_u32(vreinterpret_u32_f32(v_quot), v_denIsZero);
+                vst1_f32(out + x, vreinterpret_f32_u32(v_bits));
+                x += 2;
+            }
+
+            for (; x < size.width; ++x)
+            {
+                if (den[x] != 0.0f)
+                    out[x] = num[x] * scale / den[x];
+                else
+                    out[x] = 0.0f;
+            }
+        }
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)scale;
+#endif
+}
+
+// u8 overload of reciprocal(): a pure forwarding wrapper. Every argument is
+// handed, unchanged and in the same order, to the recip<u8> template.
+void reciprocal(const Size2D &size,
+                const u8 * srcBase, ptrdiff_t srcStride,
+                u8 * dstBase, ptrdiff_t dstStride,
+                f32 scale,
+                CONVERT_POLICY cpolicy)
+{
+    recip<u8>(size,
+              srcBase, srcStride,
+              dstBase, dstStride,
+              scale, cpolicy);
+}
+
+// s8 overload of reciprocal(): a pure forwarding wrapper. Every argument is
+// handed, unchanged and in the same order, to the recip<s8> template.
+void reciprocal(const Size2D &size,
+                const s8 * srcBase, ptrdiff_t srcStride,
+                s8 * dstBase, ptrdiff_t dstStride,
+                f32 scale,
+                CONVERT_POLICY cpolicy)
+{
+    recip<s8>(size,
+              srcBase, srcStride,
+              dstBase, dstStride,
+              scale, cpolicy);
+}
+
+// u16 overload of reciprocal(): a pure forwarding wrapper. Every argument is
+// handed, unchanged and in the same order, to the recip<u16> template.
+void reciprocal(const Size2D &size,
+                const u16 * srcBase, ptrdiff_t srcStride,
+                u16 * dstBase, ptrdiff_t dstStride,
+                f32 scale,
+                CONVERT_POLICY cpolicy)
+{
+    recip<u16>(size,
+               srcBase, srcStride,
+               dstBase, dstStride,
+               scale, cpolicy);
+}
+
+// s16 overload of reciprocal(): a pure forwarding wrapper. Every argument is
+// handed, unchanged and in the same order, to the recip<s16> template.
+void reciprocal(const Size2D &size,
+                const s16 * srcBase, ptrdiff_t srcStride,
+                s16 * dstBase, ptrdiff_t dstStride,
+                f32 scale,
+                CONVERT_POLICY cpolicy)
+{
+    recip<s16>(size,
+               srcBase, srcStride,
+               dstBase, dstStride,
+               scale, cpolicy);
+}
+
+// s32 overload of reciprocal(): a pure forwarding wrapper. Every argument is
+// handed, unchanged and in the same order, to the recip<s32> template.
+void reciprocal(const Size2D &size,
+                const s32 * srcBase, ptrdiff_t srcStride,
+                s32 * dstBase, ptrdiff_t dstStride,
+                f32 scale,
+                CONVERT_POLICY cpolicy)
+{
+    recip<s32>(size,
+               srcBase, srcStride,
+               dstBase, dstStride,
+               scale, cpolicy);
+}
+
+// f32 overload of reciprocal(): per element, dst = scale / src, with 0 stored
+// wherever src compares equal to zero.
+//
+// Rows are located with internal::getRowPtr(base, stride, row). Three cases
+// are handled separately:
+//   * scale == 0                     - each destination row is zero-filled;
+//   * |scale - 1| < FLT_EPSILON      - the multiplication by scale is skipped;
+//   * any other scale                - the reciprocal is multiplied by scale.
+// The NEON loops use internal::vrecpq_f32 / internal::vrecp_f32; the scalar
+// tail performs a real division.
+// Without CAROTENE_NEON only assertSupportedConfiguration() is executed.
+void reciprocal(const Size2D &size,
+                const f32 * srcBase, ptrdiff_t srcStride,
+                f32 * dstBase, ptrdiff_t dstStride,
+                f32 scale)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (scale == 0.0f)
+    {
+        // A zero scale gives an all-zero destination; no arithmetic needed.
+        for (size_t row = 0; row < size.height; ++row)
+            std::memset(internal::getRowPtr(dstBase, dstStride, row), 0, sizeof(f32) * size.width);
+        return;
+    }
+
+    float32x4_t v_zero = vdupq_n_f32(0.0f);
+    float32x2_t v_zero_half = vget_low_f32(v_zero);
+
+    // Exclusive upper bounds of the 4-lane and the 2-lane loops.
+    const size_t end4 = size.width >= 3 ? size.width - 3 : 0;
+    const size_t end2 = size.width >= 1 ? size.width - 1 : 0;
+
+    const bool unitScale = std::fabs(scale - 1.0f) < FLT_EPSILON;
+
+    if (unitScale)
+    {
+        for (size_t row = 0; row < size.height; ++row)
+        {
+            const f32 * src = internal::getRowPtr(srcBase, srcStride, row);
+            f32 * out = internal::getRowPtr(dstBase, dstStride, row);
+            size_t x = 0;
+
+            while (x < end4)
+            {
+                internal::prefetch(src + x);
+
+                float32x4_t v_src = vld1q_f32(src + x);
+                uint32x4_t v_srcIsZero = vceqq_f32(v_src, v_zero);
+                float32x4_t v_inv = internal::vrecpq_f32(v_src);
+
+                // Clear every bit of the lanes whose input was zero.
+                uint32x4_t v_bits = vbicq_u32(vreinterpretq_u32_f32(v_inv), v_srcIsZero);
+                vst1q_f32(out + x, vreinterpretq_f32_u32(v_bits));
+                x += 4;
+            }
+
+            while (x < end2)
+            {
+                float32x2_t v_src = vld1_f32(src + x);
+                uint32x2_t v_srcIsZero = vceq_f32(v_src, v_zero_half);
+                float32x2_t v_inv = internal::vrecp_f32(v_src);
+
+                uint32x2_t v_bits = vbic_u32(vreinterpret_u32_f32(v_inv), v_srcIsZero);
+                vst1_f32(out + x, vreinterpret_f32_u32(v_bits));
+                x += 2;
+            }
+
+            for (; x < size.width; ++x)
+            {
+                if (src[x] != 0.0f)
+                    out[x] = 1.0f / src[x];
+                else
+                    out[x] = 0;
+            }
+        }
+    }
+    else
+    {
+        for (size_t row = 0; row < size.height; ++row)
+        {
+            const f32 * src = internal::getRowPtr(srcBase, srcStride, row);
+            f32 * out = internal::getRowPtr(dstBase, dstStride, row);
+            size_t x = 0;
+
+            while (x < end4)
+            {
+                internal::prefetch(src + x);
+
+                float32x4_t v_src = vld1q_f32(src + x);
+                uint32x4_t v_srcIsZero = vceqq_f32(v_src, v_zero);
+                float32x4_t v_scaled = vmulq_n_f32(internal::vrecpq_f32(v_src), scale);
+
+                uint32x4_t v_bits = vbicq_u32(vreinterpretq_u32_f32(v_scaled), v_srcIsZero);
+                vst1q_f32(out + x, vreinterpretq_f32_u32(v_bits));
+                x += 4;
+            }
+
+            while (x < end2)
+            {
+                float32x2_t v_src = vld1_f32(src + x);
+                uint32x2_t v_srcIsZero = vceq_f32(v_src, v_zero_half);
+                float32x2_t v_scaled = vmul_n_f32(internal::vrecp_f32(v_src), scale);
+
+                uint32x2_t v_bits = vbic_u32(vreinterpret_u32_f32(v_scaled), v_srcIsZero);
+                vst1_f32(out + x, vreinterpret_f32_u32(v_bits));
+                x += 2;
+            }
+
+            for (; x < size.width; ++x)
+            {
+                if (src[x] != 0.0f)
+                    out[x] = scale / src[x];
+                else
+                    out[x] = 0;
+            }
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)scale;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/dot_product.cpp
+++ b/3rdparty/carotene/src/dot_product.cpp
@ -0,0 +1,260 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+namespace CAROTENE_NS {
+
+// Dot product of two u8 images: the sum over all pixels of src0 * src1,
+// returned as f64.
+//
+// If both strides are equal to the row width, the image is walked as one long
+// row. Per row, products are summed in 32-bit lanes for at most
+// DOT_UINT_BLOCKSIZE elements at a time, then folded into 64-bit lanes; a
+// single 8-element step and a scalar loop handle what is left of the row.
+// Without CAROTENE_NEON the function returns 0.
+f64 dotProduct(const Size2D &_size,
+               const u8 * src0Base, ptrdiff_t src0Stride,
+               const u8 * src1Base, ptrdiff_t src1Stride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    Size2D size(_size);
+    const bool contiguous = src0Stride == src1Stride &&
+                            src0Stride == (ptrdiff_t)(size.width);
+    if (contiguous)
+    {
+        // No gap between consecutive rows: treat the image as a single row.
+        size.width *= size.height;
+        size.height = 1;
+    }
+
+// It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow
+// We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements
+#define DOT_UINT_BLOCKSIZE 66050*8
+    f64 total = 0.0;
+    for (size_t y = 0; y < size.height; ++y)
+    {
+        const u8 * rowA = internal::getRowPtr(src0Base, src0Stride, y);
+        const u8 * rowB = internal::getRowPtr(src1Base, src1Stride, y);
+
+        size_t x = 0;
+        uint64x2_t acc64 = vmovq_n_u64(0);
+
+        while (x + 16 <= size.width)
+        {
+            // Last start index of a 16-element step inside the current block.
+            const size_t last = std::min(x + DOT_UINT_BLOCKSIZE, size.width) - 16;
+
+            uint32x4_t accLo = vmovq_n_u32(0);
+            uint32x4_t accHi = vmovq_n_u32(0);
+
+            while (x <= last)
+            {
+                internal::prefetch(rowA + x);
+                internal::prefetch(rowB + x);
+
+                uint8x16_t va = vld1q_u8(rowA + x);
+                uint8x16_t vb = vld1q_u8(rowB + x);
+
+                uint16x8_t prodLo = vmull_u8(vget_low_u8(va), vget_low_u8(vb));
+                uint16x8_t prodHi = vmull_u8(vget_high_u8(va), vget_high_u8(vb));
+
+                accLo = vpadalq_u16(accLo, prodLo);
+                accHi = vpadalq_u16(accHi, prodHi);
+                x += 16;
+            }
+
+            // Fold the 32-bit block sums into the 64-bit row accumulator.
+            acc64 = vpadalq_u32(acc64, accLo);
+            acc64 = vpadalq_u32(acc64, accHi);
+        }
+
+        if (x + 8 <= size.width)
+        {
+            uint16x8_t prod = vmull_u8(vld1_u8(rowA + x), vld1_u8(rowB + x));
+            acc64 = vpadalq_u32(acc64, vpaddlq_u16(prod));
+            x += 8;
+        }
+
+        uint64x1_t folded = vadd_u64(vget_low_u64(acc64), vget_high_u64(acc64));
+        total += (double)vget_lane_u64(folded, 0);
+
+        for (; x < size.width; ++x)
+            total += s32(rowA[x]) * s32(rowB[x]);
+    }
+    return total;
+#else
+    (void)_size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+
+    return 0;
+#endif
+}
+
+// Dot product of two s8 images: the sum over all pixels of src0 * src1,
+// returned as f64.
+//
+// If both strides are equal to the row width, the image is walked as one long
+// row. Per row, products are summed in signed 32-bit lanes for at most
+// DOT_INT_BLOCKSIZE elements at a time, then folded into 64-bit lanes; a
+// single 8-element step and a scalar loop handle what is left of the row.
+// Without CAROTENE_NEON the function returns 0.
+f64 dotProduct(const Size2D &_size,
+               const s8 * src0Base, ptrdiff_t src0Stride,
+               const s8 * src1Base, ptrdiff_t src1Stride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    Size2D size(_size);
+    if (src0Stride == src1Stride &&
+        src0Stride == (ptrdiff_t)(size.width))
+    {
+        // No gap between consecutive rows: treat the image as a single row.
+        size.width *= size.height;
+        size.height = 1;
+    }
+
+// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow
+// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements
+// (65535 steps * 2 products * 128*128 = 2147450880, which still fits in a signed 32-bit lane).
+#define DOT_INT_BLOCKSIZE (131070*8)
+    f64 result = 0.0;
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
+        const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);
+
+        size_t i = 0;
+        int64x2_t ws = vmovq_n_s64(0);
+
+        while(i + 16 <= size.width)
+        {
+            // Bound the block with the signed limit defined above; the
+            // original code used the unsigned overload's DOT_UINT_BLOCKSIZE.
+            size_t lim = std::min(i + (size_t)DOT_INT_BLOCKSIZE, size.width) - 16;
+
+            int32x4_t s1 = vmovq_n_s32(0);
+            int32x4_t s2 = vmovq_n_s32(0);
+
+            for (; i <= lim; i += 16)
+            {
+                internal::prefetch(src0 + i);
+                internal::prefetch(src1 + i);
+
+                int8x16_t vs1 = vld1q_s8(src0 + i);
+                int8x16_t vs2 = vld1q_s8(src1 + i);
+
+                int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));
+                int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));
+
+                // Each 32-bit lane receives two 16-bit products per step.
+                s1 = vpadalq_s16(s1, vdot1);
+                s2 = vpadalq_s16(s2, vdot2);
+            }
+
+            // Fold the 32-bit block sums into the 64-bit row accumulator.
+            ws = vpadalq_s32(ws, s1);
+            ws = vpadalq_s32(ws, s2);
+        }
+
+        if(i + 8 <= size.width)
+        {
+            int8x8_t vs1 = vld1_s8(src0 + i);
+            int8x8_t vs2 = vld1_s8(src1 + i);
+
+            ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));
+            i += 8;
+        }
+
+        result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);
+
+        for (; i < size.width; ++i)
+            result += s32(src0[i]) * s32(src1[i]);
+    }
+    return result;
+#else
+    (void)_size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+
+    return 0;
+#endif
+}
+
+// Dot product of two f32 images: the sum over all pixels of src0 * src1,
+// returned as f64.
+//
+// If both strides are equal to width * sizeof(f32), the image is walked as
+// one long row. Per row, products are summed in a 4-lane f32 accumulator for
+// at most DOT_FLOAT_BLOCKSIZE elements, and each block sum is then added to
+// the f64 total; a single 2-element step and a scalar loop handle what is left
+// of the row.
+// Without CAROTENE_NEON the function returns 0.
+f64 dotProduct(const Size2D &_size,
+               const f32 * src0Base, ptrdiff_t src0Stride,
+               const f32 * src1Base, ptrdiff_t src1Stride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    Size2D size(_size);
+    const bool contiguous = src0Stride == src1Stride &&
+                            src0Stride == (ptrdiff_t)(size.width * sizeof(f32));
+    if (contiguous)
+    {
+        // No gap between consecutive rows: treat the image as a single row.
+        size.width *= size.height;
+        size.height = 1;
+    }
+
+#define DOT_FLOAT_BLOCKSIZE (1 << 13)
+    f64 total = 0.0;
+    for (size_t y = 0; y < size.height; ++y)
+    {
+        const f32 * rowA = internal::getRowPtr(src0Base, src0Stride, y);
+        const f32 * rowB = internal::getRowPtr(src1Base, src1Stride, y);
+
+        size_t x = 0;
+        while (x + 4 <= size.width)
+        {
+            // Last start index of a 4-element step inside the current block.
+            const size_t last = std::min(x + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
+            float32x4_t v_acc = vdupq_n_f32(0.0f);
+
+            while (x <= last)
+            {
+                internal::prefetch(rowA + x);
+                internal::prefetch(rowB + x);
+                v_acc = vmlaq_f32(v_acc, vld1q_f32(rowA + x), vld1q_f32(rowB + x));
+                x += 4;
+            }
+
+            // Reduce the four lanes to two, then add both to the total.
+            float32x2_t v_pair = vpadd_f32(vget_low_f32(v_acc), vget_high_f32(v_acc));
+            total += vget_lane_f32(v_pair, 0) + vget_lane_f32(v_pair, 1);
+        }
+
+        if (x + 2 <= size.width)
+        {
+            float32x2_t v_prod = vmul_f32(vld1_f32(rowA + x), vld1_f32(rowB + x));
+            total += vget_lane_f32(v_prod, 0) + vget_lane_f32(v_prod, 1);
+            x += 2;
+        }
+
+        for (; x < size.width; ++x)
+            total += rowA[x] * rowB[x];
+    }
+    return total;
+#else
+    (void)_size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+
+    return 0;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/fast.cpp
+++ b/3rdparty/carotene/src/fast.cpp
@ -0,0 +1,428 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+
+/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
+   Below is the original copyright and the references */
+
+/*
+Copyright (c) 2006, 2008 Edward Rosten
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ *Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+
+ *Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+ *Neither the name of the University of Cambridge nor the names of
+  its contributors may be used to endorse or promote products derived
+  from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+The references are:
+ * Machine learning for high-speed corner detection,
+   E. Rosten and T. Drummond, ECCV 2006
+ * Faster and better: A machine learning approach to corner detection
+   E. Rosten, R. Porter and T. Drummond, PAMI, 2009
+*/
+
+#include "common.hpp"
+
+#include <vector>
+#include <cstring>
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+namespace
+{
+
+// Fills pixel[0..15] with the offsets, relative to a centre pixel, of the 16
+// ring points used by the detector: pixel[k] = dx + row_stride * dy.
+// Entry 0 is (0, +3); the following entries walk once around the centre and
+// end at (-1, +3). The caller must supply room for at least 16 entries.
+void makeOffsets(ptrdiff_t pixel[], ptrdiff_t row_stride)
+{
+    // {dx, dy} for each of the 16 ring points, in output order.
+    static const ptrdiff_t ring[16][2] =
+    {
+        { 0,  3}, { 1,  3}, { 2,  2}, { 3,  1},
+        { 3,  0}, { 3, -1}, { 2, -2}, { 1, -3},
+        { 0, -3}, {-1, -3}, {-2, -2}, {-3, -1},
+        {-3,  0}, {-3,  1}, {-2,  2}, {-1,  3}
+    };
+
+    for (int k = 0; k < 16; ++k)
+        pixel[k] = ring[k][0] + row_stride * ring[k][1];
+}
+
+// Computes the score of the pixel at ptr from its ring of neighbours.
+//
+// pixel[] must hold N = 25 offsets (the 16 ring offsets followed by the first
+// 9 repeated). With d[k] = ptr[0] - ptr[pixel[k]], and for every start index
+// k in 0..15:
+//   a = min(d[k+1..k+8]),  b = max(d[k+1..k+8])
+//   q0 = max(q0, min(a, d[k]), min(a, d[k+9]))
+//   q1 = min(q1, max(b, d[k]), max(b, d[k+9]))
+// starting from q0 = -1000 and q1 = 1000. The result is
+// max(q0, -q1) - 1, taken over all start indices and truncated to u8.
+// Eight start indices are evaluated at once, one per 16-bit lane.
+u8 cornerScore(const u8* ptr, const ptrdiff_t pixel[])
+{
+    const s32 K = 8, N = 16 + K + 1;
+    s32 k, v = ptr[0];
+    s16 d[(N + 7) & ~7];
+    for( k = 0; k < N; k++ )
+        d[k] = (s16)(v - ptr[pixel[k]]);
+    // The array is padded to a multiple of 8 so it can be loaded as whole
+    // vectors. Zero the padding: vld1q_s16(d + 24) below reads d[24..31], and
+    // although only d[24] contributes to the result, the remaining elements
+    // would otherwise be read uninitialised.
+    for( ; k < (s32)(sizeof(d) / sizeof(d[0])); k++ )
+        d[k] = 0;
+
+    int16x8_t q0 = vdupq_n_s16((s16)(-1000));
+    int16x8_t q1 = vdupq_n_s16((s16)(1000));
+
+    int16x8_t d0_7   = vld1q_s16(d +  0);
+    int16x8_t d8_15  = vld1q_s16(d +  8);
+    int16x8_t d16_23 = vld1q_s16(d + 16);
+    int16x8_t d24    = vld1q_s16(d + 24);
+
+    // Start indices k = 0..7: lane l of vextq_s16(d0_7, d8_15, n) is d[l + n],
+    // so ak0/bk0 end up as the min/max of d[l+1..l+8].
+    int16x8_t v0k0 = vextq_s16(d0_7, d8_15, 1);
+    int16x8_t v1k0 = vextq_s16(d0_7, d8_15, 2);
+    int16x8_t ak0 = vminq_s16(v0k0, v1k0);
+    int16x8_t bk0 = vmaxq_s16(v0k0, v1k0);
+
+    v0k0 = vextq_s16(d0_7, d8_15, 3);
+    ak0 = vminq_s16(ak0, v0k0);
+    bk0 = vmaxq_s16(bk0, v0k0);
+
+    v1k0 = vextq_s16(d0_7, d8_15, 4);
+    ak0 = vminq_s16(ak0, v1k0);
+    bk0 = vmaxq_s16(bk0, v1k0);
+
+    v0k0 = vextq_s16(d0_7, d8_15, 5);
+    ak0 = vminq_s16(ak0, v0k0);
+    bk0 = vmaxq_s16(bk0, v0k0);
+
+    v1k0 = vextq_s16(d0_7, d8_15, 6);
+    ak0 = vminq_s16(ak0, v1k0);
+    bk0 = vmaxq_s16(bk0, v1k0);
+
+    v0k0 = vextq_s16(d0_7, d8_15, 7);
+    ak0 = vminq_s16(ak0, v0k0);
+    bk0 = vmaxq_s16(bk0, v0k0);
+
+    ak0 = vminq_s16(ak0, d8_15);
+    bk0 = vmaxq_s16(bk0, d8_15);
+
+    // Extend the 8-element window by d[l] on one side ...
+    q0 = vmaxq_s16(q0, vminq_s16(ak0, d0_7));
+    q1 = vminq_s16(q1, vmaxq_s16(bk0, d0_7));
+
+    // ... and by d[l + 9] on the other.
+    v1k0 = vextq_s16(d8_15, d16_23, 1);
+    q0 = vmaxq_s16(q0, vminq_s16(ak0, v1k0));
+    q1 = vminq_s16(q1, vmaxq_s16(bk0, v1k0));
+
+    // Start indices k = 8..15: same scheme, shifted by one vector. v1k0 already
+    // holds d[l + 9], the first element of this window.
+    int16x8_t v0k8 = v1k0;
+    int16x8_t v1k8 = vextq_s16(d8_15, d16_23, 2);
+    int16x8_t ak8 = vminq_s16(v0k8, v1k8);
+    int16x8_t bk8 = vmaxq_s16(v0k8, v1k8);
+
+    v0k8 = vextq_s16(d8_15, d16_23, 3);
+    ak8 = vminq_s16(ak8, v0k8);
+    bk8 = vmaxq_s16(bk8, v0k8);
+
+    v1k8 = vextq_s16(d8_15, d16_23, 4);
+    ak8 = vminq_s16(ak8, v1k8);
+    bk8 = vmaxq_s16(bk8, v1k8);
+
+    v0k8 = vextq_s16(d8_15, d16_23, 5);
+    ak8 = vminq_s16(ak8, v0k8);
+    bk8 = vmaxq_s16(bk8, v0k8);
+
+    v1k8 = vextq_s16(d8_15, d16_23, 6);
+    ak8 = vminq_s16(ak8, v1k8);
+    bk8 = vmaxq_s16(bk8, v1k8);
+
+    v0k8 = vextq_s16(d8_15, d16_23, 7);
+    ak8 = vminq_s16(ak8, v0k8);
+    bk8 = vmaxq_s16(bk8, v0k8);
+
+    ak8 = vminq_s16(ak8, d16_23);
+    bk8 = vmaxq_s16(bk8, d16_23);
+
+    q0 = vmaxq_s16(q0, vminq_s16(ak8, d8_15));
+    q1 = vminq_s16(q1, vmaxq_s16(bk8, d8_15));
+
+    // Only lane 0 of d24 (that is, d[24]) is shifted in here.
+    v1k8 = vextq_s16(d16_23, d24, 1);
+    q0 = vmaxq_s16(q0, vminq_s16(ak8, v1k8));
+    q1 = vminq_s16(q1, vmaxq_s16(bk8, v1k8));
+
+    // Final reduction: max(q0, -q1) per lane, then the maximum across lanes
+    // (8 -> 4 lanes, widened to 32 bits, 4 -> 2 -> 1).
+    int16x8_t q = vmaxq_s16(q0, vsubq_s16(vmovq_n_s16(0), q1));
+    int16x4_t q2 = vmax_s16(vget_low_s16(q), vget_high_s16(q));
+    int32x4_t q2w = vmovl_s16(q2);
+    int32x2_t q4 = vmax_s32(vget_low_s32(q2w), vget_high_s32(q2w));
+    // The 64-bit shift right by 32 moves lane 1 into lane 0 for the last max.
+    int32x2_t q8 = vmax_s32(q4, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(q4), 32)));
+
+    return (u8)(vget_lane_s32(q8, 0) - 1);
+}
+
+} //namespace
+#endif
+
+void FAST(const Size2D &size,
+          u8 *srcBase, ptrdiff_t srcStride,
+          KeypointStore *keypoints,
+          u8 threshold, bool nonmax_suppression)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    /* Corner detection over the image at srcBase (rows are located with
+     * internal::getRowPtr and srcStride). A pixel is a corner when its ring of
+     * 16 neighbours (offsets from makeOffsets) contains more than K = 8
+     * consecutive pixels that are all brighter than centre + threshold, or all
+     * darker than centre - threshold. Corners are reported with
+     * keypoints->push(); the store is not cleared first, so results are added
+     * to whatever it already holds. With nonmax_suppression set, a corner is
+     * reported only if its cornerScore() is strictly greater than the scores
+     * of its 8 neighbours; otherwise every corner is reported with score 0.
+     * Without CAROTENE_NEON the function does nothing. */
+
+    const s32 K = 8, N = 16 + K + 1; // N = 25: the 16 ring offsets plus K + 1 repeated ones
+    ptrdiff_t i, j, k, pixel[N];
+    makeOffsets(pixel, srcStride);
+    for(k = 16; k < N; k++)
+        pixel[k] = pixel[k - 16]; // repeat the start of the ring so runs that wrap past index 15 are seen
+
+    uint8x16_t delta = vdupq_n_u8(128); // bias that lets unsigned pixels be compared with signed intrinsics
+    uint8x16_t t = vdupq_n_u8(threshold);
+    uint8x16_t K16 = vdupq_n_u8((u8)K);
+
+    u8 threshold_tab[512]; // indexed by (difference + 255): 1 = below -threshold, 2 = above +threshold, 0 = neither
+    for( i = -255; i <= 255; i++ )
+        threshold_tab[i+255] = (u8)(i < -threshold ? 1 : i > threshold ? 2 : 0);
+
+    std::vector<u8> _buf((size.width+16)*3*(sizeof(ptrdiff_t) + sizeof(u8)) + 128); // one allocation backing buf[] and cpbuf[]
+    u8* buf[3]; // per-row score buffers, used cyclically for three consecutive rows
+    buf[0] = &_buf[0]; buf[1] = buf[0] + size.width; buf[2] = buf[1] + size.width;
+    ptrdiff_t* cpbuf[3]; // per-row lists of corner columns; element [-1] of each list holds its length
+    cpbuf[0] = (ptrdiff_t*)internal::alignPtr(buf[2] + size.width, sizeof(ptrdiff_t)) + 1;
+    cpbuf[1] = cpbuf[0] + size.width + 1;
+    cpbuf[2] = cpbuf[1] + size.width + 1;
+    memset(buf[0], 0, size.width*3);
+
+    for(i = 3; i < (ptrdiff_t)size.height-2; i++) // the last iteration (i == height - 3) only emits the previous row
+    {
+        const u8* ptr = internal::getRowPtr(srcBase, srcStride, i) + 3; // ptr tracks column j, which starts at 3
+        u8* curr = buf[(i - 3)%3];
+        ptrdiff_t* cornerpos = cpbuf[(i - 3)%3];
+        memset(curr, 0, size.width);
+        ptrdiff_t ncorners = 0;
+
+        if( i < (ptrdiff_t)size.height - 3 )
+        {
+            j = 3;
+
+            for(; j < (ptrdiff_t)size.width - 16 - 3; j += 16, ptr += 16) // NEON path: 16 candidate pixels per step
+            {
+                internal::prefetch(ptr);
+                internal::prefetch(ptr + pixel[0]);
+                internal::prefetch(ptr + pixel[2]);
+
+                uint8x16_t v0 = vld1q_u8(ptr);
+                int8x16_t v1 = vreinterpretq_s8_u8(veorq_u8(vqsubq_u8(v0, t), delta)); // centre - threshold (saturated), biased
+                int8x16_t v2 = vreinterpretq_s8_u8(veorq_u8(vqaddq_u8(v0, t), delta)); // centre + threshold (saturated), biased
+
+                int8x16_t x0 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[0]), delta)); // ring points 0, 4, 8 and 12, biased
+                int8x16_t x1 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[4]), delta));
+                int8x16_t x2 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[8]), delta));
+                int8x16_t x3 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[12]), delta));
+
+                uint8x16_t m0 =   vandq_u8(vcgtq_s8(x0, v2), vcgtq_s8(x1, v2)); // m0: two neighbouring sampled points both brighter
+                uint8x16_t m1 =   vandq_u8(vcgtq_s8(v1, x0), vcgtq_s8(v1, x1)); // m1: two neighbouring sampled points both darker
+                m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x1, v2), vcgtq_s8(x2, v2)));
+                m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x1), vcgtq_s8(v1, x2)));
+                m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x2, v2), vcgtq_s8(x3, v2)));
+                m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x2), vcgtq_s8(v1, x3)));
+                m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x3, v2), vcgtq_s8(x0, v2)));
+                m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x3), vcgtq_s8(v1, x0)));
+                m0 = vorrq_u8(m0, m1); // lanes that pass the quick test either way
+
+                u64 mask[2];
+                vst1q_u64(mask, vreinterpretq_u64_u8(m0));
+
+                if( mask[0] == 0 ) // no candidate among the first 8 pixels
+                {
+                    if (mask[1] != 0)
+                    {
+                        j -= 8; // with the loop's += 16 this advances by 8, so the upper 8 pixels are retested as the lower half
+                        ptr -= 8;
+                    }
+                    continue;
+                }
+
+                uint8x16_t c0 = vmovq_n_u8(0); // current run length of brighter ring pixels, per lane
+                uint8x16_t c1 = vmovq_n_u8(0); // current run length of darker ring pixels, per lane
+                uint8x16_t max0 = vmovq_n_u8(0);
+                uint8x16_t max1 = vmovq_n_u8(0);
+                for( k = 0; k < N; k++ )
+                {
+                    int8x16_t x = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(ptr + pixel[k]), delta));
+                    m0 = vcgtq_s8(x, v2);
+                    m1 = vcgtq_s8(v1, x);
+
+                    c0 = vandq_u8(vsubq_u8(c0, m0), m0); // mask lanes are 0xFF or 0: add 1 to the run where set, reset to 0 elsewhere
+                    c1 = vandq_u8(vsubq_u8(c1, m1), m1);
+
+                    max0 = vmaxq_u8(max0, c0);
+                    max1 = vmaxq_u8(max1, c1);
+                }
+
+                max0 = vmaxq_u8(max0, max1);
+                u8 m[16];
+                vst1q_u8(m, vcgtq_u8(max0, K16)); // non-zero where the longest run exceeds K
+
+                for( k = 0; k < 16; ++k )
+                    if(m[k])
+                    {
+                        cornerpos[ncorners++] = j+k;
+                        if(nonmax_suppression)
+                            curr[j+k] = cornerScore(ptr+k, pixel);
+                    }
+            }
+
+            for( ; j < (s32)size.width - 3; j++, ptr++ ) // scalar path for the remaining columns
+            {
+                s32 v = ptr[0];
+                const u8* tab = &threshold_tab[0] - v + 255; // tab[x] classifies the difference x - v
+                s32 d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]]; // bit 0: darker candidate, bit 1: brighter candidate
+
+                if( d == 0 )
+                    continue;
+
+                d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
+                d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
+                d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];
+
+                if( d == 0 )
+                    continue;
+
+                d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
+                d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
+                d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
+                d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];
+
+                if( d & 1 ) // look for more than K consecutive ring pixels darker than v - threshold
+                {
+                    s32 vt = v - threshold, count = 0;
+
+                    for( k = 0; k < N; k++ )
+                    {
+                        s32 x = ptr[pixel[k]];
+                        if(x < vt)
+                        {
+                            if( ++count > K )
+                            {
+                                cornerpos[ncorners++] = j;
+                                if(nonmax_suppression)
+                                    curr[j] = cornerScore(ptr, pixel);
+                                break;
+                            }
+                        }
+                        else
+                            count = 0;
+                    }
+                }
+
+                if( d & 2 ) // look for more than K consecutive ring pixels brighter than v + threshold
+                {
+                    s32 vt = v + threshold, count = 0;
+
+                    for( k = 0; k < N; k++ )
+                    {
+                        s32 x = ptr[pixel[k]];
+                        if(x > vt)
+                        {
+                            if( ++count > K )
+                            {
+                                cornerpos[ncorners++] = j;
+                                if(nonmax_suppression)
+                                    curr[j] = cornerScore(ptr, pixel);
+                                break;
+                            }
+                        }
+                        else
+                            count = 0;
+                    }
+                }
+            }
+        }
+
+        cornerpos[-1] = ncorners; // keep the list length in the slot just before the list
+
+        if( i == 3 ) // first processed row: there is no previous row to emit yet
+            continue;
+
+        const u8* prev = buf[(i - 4 + 3)%3]; // scores of row i - 1, the row being emitted
+        const u8* pprev = buf[(i - 5 + 3)%3]; // scores of row i - 2
+        cornerpos = cpbuf[(i - 4 + 3)%3];
+        ncorners = cornerpos[-1];
+
+        for( k = 0; k < ncorners; k++ )
+        {
+            j = cornerpos[k];
+            s32 score = prev[j];
+            if( !nonmax_suppression ||
+                    (score > prev[j+1] && score > prev[j-1] &&
+                     score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
+                     score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
+            {
+                keypoints->push((f32)j, (f32)(i-1), 7.f, -1, (f32)score); // column j, row i - 1, score last; NOTE(review): the meaning of 7.f and -1 is set by KeypointStore, not visible here
+            }
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)keypoints;
+    (void)threshold;
+    (void)nonmax_suppression;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/fill_minmaxloc.cpp
+++ b/3rdparty/carotene/src/fill_minmaxloc.cpp
@ -0,0 +1,442 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+// Scalar helper: walks columns [j0, j1) of the row starting at src and, for
+// every element equal to maxVal and/or minVal, appends the pair (column, i)
+// to the corresponding location list.
+//
+// Counters and capacities are measured in size_t slots, two slots per
+// location. A counter keeps growing by two per hit even once its list is
+// full; only the store into the list is skipped in that case.
+template <typename T>
+void process(const T * src, size_t j0, size_t j1, size_t i,
+             T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
+             T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
+{
+    size_t col = j0;
+    while (col < j1)
+    {
+        const T cur = src[col];
+
+        if (cur == maxVal)
+        {
+            // Claim the next slot pair first, then fill it if it exists.
+            const s32 slot = maxLocCount;
+            maxLocCount = slot + 2;
+            if (slot < maxLocCapacity)
+            {
+                maxLocPtr[slot] = col;
+                maxLocPtr[slot + 1] = i;
+            }
+        }
+
+        if (cur == minVal)
+        {
+            const s32 slot = minLocCount;
+            minLocCount = slot + 2;
+            if (slot < minLocCapacity)
+            {
+                minLocPtr[slot] = col;
+                minLocPtr[slot + 1] = i;
+            }
+        }
+
+        ++col;
+    }
+}
+
+} // namespace
+
+#endif
+
+// Records the (column, row) coordinates of every u8 pixel equal to minVal or
+// maxVal.
+//
+// Rows are located with internal::getRowPtr(srcBase, srcStride, row). Hits are
+// stored as consecutive (column, row) pairs in minLocPtr / maxLocPtr; at most
+// minLocCapacity / maxLocCapacity pairs are written, while the counters advance
+// for every hit, stored or not.
+// NOTE(review): the counters are not reset here, so callers presumably pass
+// them in as 0 - confirm.
+// Without CAROTENE_NEON only the configuration assert is executed.
+void fillMinMaxLocs(const Size2D & size,
+                    const u8 * srcBase, ptrdiff_t srcStride,
+                    u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
+                    u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Columns below these bounds can start a full 16- / 8-pixel vector load.
+    const size_t vecEnd16 = size.width >= 15 ? size.width - 15 : 0;
+    const size_t vecEnd8 = size.width >= 7 ? size.width - 7 : 0;
+
+    const uint8x16_t v_max16 = vdupq_n_u8(maxVal), v_min16 = vdupq_n_u8(minVal);
+    const uint8x8_t v_max8 = vdup_n_u8(maxVal), v_min8 = vdup_n_u8(minVal);
+
+    // The per-lane compare result is spilled here so that eight lanes at a
+    // time can be tested with a single 64-bit comparison.
+    u64 hits[2] = { 0ul };
+
+    // process() counts in size_t slots, two per location.
+    minLocCapacity <<= 1;
+    maxLocCapacity <<= 1;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const u8 * line = internal::getRowPtr(srcBase, srcStride, row);
+        size_t col = 0;
+
+        while (col < vecEnd16)
+        {
+            internal::prefetch(line + col);
+            const uint8x16_t v_px = vld1q_u8(line + col);
+            const uint8x16_t v_hit = vorrq_u8(vceqq_u8(v_px, v_max16),
+                                              vceqq_u8(v_px, v_min16));
+
+            vst1q_u8((u8 *)&hits[0], v_hit);
+
+            // Run the scalar scan only over the halves that contain a hit.
+            if (hits[0])
+                process(line, col, col + 8, row,
+                        minVal, minLocPtr, minLocCount, minLocCapacity,
+                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+            if (hits[1])
+                process(line, col + 8, col + 16, row,
+                        minVal, minLocPtr, minLocCount, minLocCapacity,
+                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+
+            col += 16;
+        }
+
+        while (col < vecEnd8)
+        {
+            const uint8x8_t v_px = vld1_u8(line + col);
+            const uint8x8_t v_hit = vorr_u8(vceq_u8(v_px, v_max8),
+                                            vceq_u8(v_px, v_min8));
+
+            vst1_u8((u8 *)&hits[0], v_hit);
+
+            if (hits[0])
+                process(line, col, col + 8, row,
+                        minVal, minLocPtr, minLocCount, minLocCapacity,
+                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+
+            col += 8;
+        }
+
+        // Remaining columns that do not fill a vector.
+        process(line, col, size.width, row,
+                minVal, minLocPtr, minLocCount, minLocCapacity,
+                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+    }
+
+    // Convert the slot counters back to location counts.
+    minLocCount >>= 1;
+    maxLocCount >>= 1;
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)minVal;
+    (void)minLocPtr;
+    (void)minLocCount;
+    (void)minLocCapacity;
+    (void)maxVal;
+    (void)maxLocPtr;
+    (void)maxLocCount;
+    (void)maxLocCapacity;
+#endif
+}
+
+// Records the (column, row) coordinates of every u16 pixel equal to minVal or
+// maxVal.
+//
+// Rows are located with internal::getRowPtr(srcBase, srcStride, row). Hits are
+// stored as consecutive (column, row) pairs in minLocPtr / maxLocPtr; at most
+// minLocCapacity / maxLocCapacity pairs are written, while the counters advance
+// for every hit, stored or not.
+// NOTE(review): the counters are not reset here, so callers presumably pass
+// them in as 0 - confirm.
+// Without CAROTENE_NEON only the configuration assert is executed.
+void fillMinMaxLocs(const Size2D & size,
+                    const u16 * srcBase, ptrdiff_t srcStride,
+                    u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
+                    u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Columns below these bounds can start a full 16- / 8-element step.
+    const size_t vecEnd16 = size.width >= 15 ? size.width - 15 : 0;
+    const size_t vecEnd8 = size.width >= 7 ? size.width - 7 : 0;
+
+    const uint16x8_t v_max = vdupq_n_u16(maxVal),
+                     v_min = vdupq_n_u16(minVal);
+
+    // Compare masks are narrowed to one byte per element and spilled here so
+    // that eight elements can be tested with a single 64-bit comparison.
+    u64 hits[2] = { 0ul };
+
+    // process() counts in size_t slots, two per location.
+    minLocCapacity <<= 1;
+    maxLocCapacity <<= 1;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const u16 * line = internal::getRowPtr(srcBase, srcStride, row);
+        size_t col = 0;
+
+        while (col < vecEnd16)
+        {
+            internal::prefetch(line + col);
+            const uint16x8_t v_lo = vld1q_u16(line + col);
+            const uint16x8_t v_hi = vld1q_u16(line + col + 8);
+
+            const uint16x8_t v_hitLo = vorrq_u16(vceqq_u16(v_lo, v_max), vceqq_u16(v_lo, v_min));
+            const uint16x8_t v_hitHi = vorrq_u16(vceqq_u16(v_hi, v_max), vceqq_u16(v_hi, v_min));
+
+            vst1q_u8((u8 *)&hits[0], vcombine_u8(vmovn_u16(v_hitLo), vmovn_u16(v_hitHi)));
+
+            // Run the scalar scan only over the halves that contain a hit.
+            if (hits[0])
+                process(line, col, col + 8, row,
+                        minVal, minLocPtr, minLocCount, minLocCapacity,
+                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+            if (hits[1])
+                process(line, col + 8, col + 16, row,
+                        minVal, minLocPtr, minLocCount, minLocCapacity,
+                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+
+            col += 16;
+        }
+
+        while (col < vecEnd8)
+        {
+            internal::prefetch(line + col);
+            const uint16x8_t v_px = vld1q_u16(line + col);
+            const uint16x8_t v_hit = vorrq_u16(vceqq_u16(v_px, v_max),
+                                               vceqq_u16(v_px, v_min));
+
+            vst1_u8((u8 *)&hits[0], vmovn_u16(v_hit));
+
+            if (hits[0])
+                process(line, col, col + 8, row,
+                        minVal, minLocPtr, minLocCount, minLocCapacity,
+                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+
+            col += 8;
+        }
+
+        // Remaining columns that do not fill a vector.
+        process(line, col, size.width, row,
+                minVal, minLocPtr, minLocCount, minLocCapacity,
+                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+    }
+
+    // Convert the slot counters back to location counts.
+    minLocCount >>= 1;
+    maxLocCount >>= 1;
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)minVal;
+    (void)minLocPtr;
+    (void)minLocCount;
+    (void)minLocCapacity;
+    (void)maxVal;
+    (void)maxLocPtr;
+    (void)maxLocCount;
+    (void)maxLocCapacity;
+#endif
+}
+
+// Records the (column, row) coordinates of every s16 pixel equal to minVal or
+// maxVal.
+//
+// Rows are located with internal::getRowPtr(srcBase, srcStride, row). Hits are
+// stored as consecutive (column, row) pairs in minLocPtr / maxLocPtr; at most
+// minLocCapacity / maxLocCapacity pairs are written, while the counters advance
+// for every hit, stored or not.
+// NOTE(review): the counters are not reset here, so callers presumably pass
+// them in as 0 - confirm.
+// Without CAROTENE_NEON only the configuration assert is executed.
+void fillMinMaxLocs(const Size2D & size,
+                    const s16 * srcBase, ptrdiff_t srcStride,
+                    s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
+                    s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Columns below these bounds can start a full 16- / 8-element step.
+    const size_t vecEnd16 = size.width >= 15 ? size.width - 15 : 0;
+    const size_t vecEnd8 = size.width >= 7 ? size.width - 7 : 0;
+
+    const int16x8_t v_max = vdupq_n_s16(maxVal),
+                    v_min = vdupq_n_s16(minVal);
+
+    // Compare masks are narrowed to one byte per element and spilled here so
+    // that eight elements can be tested with a single 64-bit comparison.
+    u64 hits[2] = { 0ul };
+
+    // process() counts in size_t slots, two per location.
+    minLocCapacity <<= 1;
+    maxLocCapacity <<= 1;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const s16 * line = internal::getRowPtr(srcBase, srcStride, row);
+        size_t col = 0;
+
+        while (col < vecEnd16)
+        {
+            internal::prefetch(line + col);
+            const int16x8_t v_lo = vld1q_s16(line + col);
+            const int16x8_t v_hi = vld1q_s16(line + col + 8);
+
+            const uint16x8_t v_hitLo = vorrq_u16(vceqq_s16(v_lo, v_max), vceqq_s16(v_lo, v_min));
+            const uint16x8_t v_hitHi = vorrq_u16(vceqq_s16(v_hi, v_max), vceqq_s16(v_hi, v_min));
+
+            vst1q_u8((u8 *)&hits[0], vcombine_u8(vmovn_u16(v_hitLo), vmovn_u16(v_hitHi)));
+
+            // Run the scalar scan only over the halves that contain a hit.
+            if (hits[0])
+                process(line, col, col + 8, row,
+                        minVal, minLocPtr, minLocCount, minLocCapacity,
+                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+            if (hits[1])
+                process(line, col + 8, col + 16, row,
+                        minVal, minLocPtr, minLocCount, minLocCapacity,
+                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+
+            col += 16;
+        }
+
+        while (col < vecEnd8)
+        {
+            internal::prefetch(line + col);
+            const int16x8_t v_px = vld1q_s16(line + col);
+            const uint16x8_t v_hit = vorrq_u16(vceqq_s16(v_px, v_max),
+                                               vceqq_s16(v_px, v_min));
+
+            vst1_u8((u8 *)&hits[0], vmovn_u16(v_hit));
+
+            if (hits[0])
+                process(line, col, col + 8, row,
+                        minVal, minLocPtr, minLocCount, minLocCapacity,
+                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+
+            col += 8;
+        }
+
+        // Remaining columns that do not fill a vector.
+        process(line, col, size.width, row,
+                minVal, minLocPtr, minLocCount, minLocCapacity,
+                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+    }
+
+    // Convert the slot counters back to location counts.
+    minLocCount >>= 1;
+    maxLocCount >>= 1;
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)minVal;
+    (void)minLocPtr;
+    (void)minLocCount;
+    (void)minLocCapacity;
+    (void)maxVal;
+    (void)maxLocPtr;
+    (void)maxLocCount;
+    (void)maxLocCapacity;
+#endif
+}
+
+// Records the (column, row) coordinates of every s32 pixel equal to minVal or
+// maxVal.
+//
+// Rows are located with internal::getRowPtr(srcBase, srcStride, row). Hits are
+// stored as consecutive (column, row) pairs in minLocPtr / maxLocPtr; at most
+// minLocCapacity / maxLocCapacity pairs are written, while the counters advance
+// for every hit, stored or not.
+// NOTE(review): the counters are not reset here, so callers presumably pass
+// them in as 0 - confirm.
+// Without CAROTENE_NEON only the configuration assert is executed.
+void fillMinMaxLocs(const Size2D & size,
+                    const s32 * srcBase, ptrdiff_t srcStride,
+                    s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
+                    s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Columns below this bound can start a full 8-element step.
+    const size_t vecEnd8 = size.width >= 7 ? size.width - 7 : 0;
+
+    const int32x4_t v_max = vdupq_n_s32(maxVal),
+                    v_min = vdupq_n_s32(minVal);
+
+    // Compare masks are narrowed to one byte per element and spilled here so
+    // that all eight elements can be tested with one 64-bit comparison.
+    u64 hits = 0ul;
+
+    // process() counts in size_t slots, two per location.
+    minLocCapacity <<= 1;
+    maxLocCapacity <<= 1;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const s32 * line = internal::getRowPtr(srcBase, srcStride, row);
+        size_t col = 0;
+
+        while (col < vecEnd8)
+        {
+            internal::prefetch(line + col);
+            const int32x4_t v_lo = vld1q_s32(line + col);
+            const int32x4_t v_hi = vld1q_s32(line + col + 4);
+
+            const uint32x4_t v_hitLo = vorrq_u32(vceqq_s32(v_lo, v_max), vceqq_s32(v_lo, v_min));
+            const uint32x4_t v_hitHi = vorrq_u32(vceqq_s32(v_hi, v_max), vceqq_s32(v_hi, v_min));
+
+            const uint16x8_t v_hit16 = vcombine_u16(vmovn_u32(v_hitLo), vmovn_u32(v_hitHi));
+            vst1_u8((u8 *)&hits, vmovn_u16(v_hit16));
+
+            // Run the scalar scan only when the group contains a hit.
+            if (hits)
+                process(line, col, col + 8, row,
+                        minVal, minLocPtr, minLocCount, minLocCapacity,
+                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+
+            col += 8;
+        }
+
+        // Remaining columns that do not fill a vector step.
+        process(line, col, size.width, row,
+                minVal, minLocPtr, minLocCount, minLocCapacity,
+                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+    }
+
+    // Convert the slot counters back to location counts.
+    minLocCount >>= 1;
+    maxLocCount >>= 1;
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)minVal;
+    (void)minLocPtr;
+    (void)minLocCount;
+    (void)minLocCapacity;
+    (void)maxVal;
+    (void)maxLocPtr;
+    (void)maxLocCount;
+    (void)maxLocCapacity;
+#endif
+}
+
+// Records the (column, row) coordinates of every u32 pixel equal to minVal or
+// maxVal.
+//
+// Rows are located with internal::getRowPtr(srcBase, srcStride, row). Hits are
+// stored as consecutive (column, row) pairs in minLocPtr / maxLocPtr; at most
+// minLocCapacity / maxLocCapacity pairs are written, while the counters advance
+// for every hit, stored or not.
+// NOTE(review): the counters are not reset here, so callers presumably pass
+// them in as 0 - confirm.
+// Without CAROTENE_NEON only the configuration assert is executed.
+void fillMinMaxLocs(const Size2D & size,
+                    const u32 * srcBase, ptrdiff_t srcStride,
+                    u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
+                    u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Columns below this bound can start a full 8-element step.
+    const size_t vecEnd8 = size.width >= 7 ? size.width - 7 : 0;
+
+    const uint32x4_t v_max = vdupq_n_u32(maxVal),
+                     v_min = vdupq_n_u32(minVal);
+
+    // Compare masks are narrowed to one byte per element and spilled here so
+    // that all eight elements can be tested with one 64-bit comparison.
+    u64 hits = 0ul;
+
+    // process() counts in size_t slots, two per location.
+    minLocCapacity <<= 1;
+    maxLocCapacity <<= 1;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const u32 * line = internal::getRowPtr(srcBase, srcStride, row);
+        size_t col = 0;
+
+        while (col < vecEnd8)
+        {
+            internal::prefetch(line + col);
+            const uint32x4_t v_lo = vld1q_u32(line + col);
+            const uint32x4_t v_hi = vld1q_u32(line + col + 4);
+
+            const uint32x4_t v_hitLo = vorrq_u32(vceqq_u32(v_lo, v_max), vceqq_u32(v_lo, v_min));
+            const uint32x4_t v_hitHi = vorrq_u32(vceqq_u32(v_hi, v_max), vceqq_u32(v_hi, v_min));
+
+            const uint16x8_t v_hit16 = vcombine_u16(vmovn_u32(v_hitLo), vmovn_u32(v_hitHi));
+            vst1_u8((u8 *)&hits, vmovn_u16(v_hit16));
+
+            // Run the scalar scan only when the group contains a hit.
+            if (hits)
+                process(line, col, col + 8, row,
+                        minVal, minLocPtr, minLocCount, minLocCapacity,
+                        maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+
+            col += 8;
+        }
+
+        // Remaining columns that do not fill a vector step.
+        process(line, col, size.width, row,
+                minVal, minLocPtr, minLocCount, minLocCapacity,
+                maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
+    }
+
+    // Convert the slot counters back to location counts.
+    minLocCount >>= 1;
+    maxLocCount >>= 1;
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)minVal;
+    (void)minLocPtr;
+    (void)minLocCount;
+    (void)minLocCapacity;
+    (void)maxVal;
+    (void)maxLocPtr;
+    (void)maxLocCount;
+    (void)maxLocCapacity;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/flip.cpp
+++ b/3rdparty/carotene/src/flip.cpp
@ -0,0 +1,222 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+#include <cstring>
+
+namespace CAROTENE_NS {
+
+// Tells whether flip() can handle the given mode / element size combination.
+// A pure vertical flip is accepted for any element size; horizontal and
+// combined flips require an element size of 1, 2, 3 or 4. Nothing is supported
+// when isSupportedConfiguration() reports false.
+bool isFlipSupported(FLIP_MODE flipMode, u32 elemSize)
+{
+    if (!isSupportedConfiguration())
+        return false;
+
+    if (flipMode == FLIP_VERTICAL_MODE)
+        return true;
+
+    const bool mirrorsColumns = (flipMode == FLIP_BOTH_MODE) || (flipMode == FLIP_HORIZONTAL_MODE);
+    const bool supportedElemSize = (elemSize >= 1) && (elemSize <= 4);
+
+    return mirrorsColumns && supportedElemSize;
+}
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+// Reverses the order of the T-sized elements in every row of the image.
+// When flipMode has the FLIP_VERTICAL_MODE bit set, row i of the source is
+// additionally written to row (height - 1 - i) of the destination.
+template <typename T>
+void flip(const Size2D & size,
+          const void * srcBase, ptrdiff_t srcStride,
+          void * dstBase, ptrdiff_t dstStride,
+          FLIP_MODE flipMode)
+{
+    using namespace internal;
+
+    typedef typename VecTraits<T>::vec128 vec128;
+    typedef typename VecTraits<T>::vec64 vec64;
+
+    // Number of T elements held by a 128-bit and by a 64-bit vector.
+    u32 lanes128 = 16 / sizeof(T), lanes64 = 8 / sizeof(T);
+    // Source columns below these bounds can start a full vector load.
+    size_t vecEnd128 = size.width >= (lanes128 - 1) ? size.width - lanes128 + 1 : 0;
+    size_t vecEnd64 = size.width >= (lanes64 - 1) ? size.width - lanes64 + 1 : 0;
+
+    const bool mirrorRows = (flipMode & FLIP_VERTICAL_MODE) != 0;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const size_t dstRow = mirrorRows ? size.height - row - 1 : row;
+        const T * in = getRowPtr((const T *)srcBase, srcStride, row);
+        T * out = getRowPtr((T *)dstBase, dstStride, dstRow);
+
+        // rd walks forward through the source, wr backwards through the
+        // destination (wr is one past the next element to be written).
+        size_t rd = 0, wr = size.width;
+
+        while (rd < vecEnd128)
+        {
+            prefetch(in + rd);
+
+            // vrev64q reverses the elements inside each 64-bit half; swapping
+            // the two halves afterwards completes the 128-bit reversal.
+            vec128 v_rev = vrev64q(vld1q(in + rd));
+            v_rev = vcombine(vget_high(v_rev), vget_low(v_rev));
+            vst1q(out + wr - lanes128, v_rev);
+
+            rd += lanes128;
+            wr -= lanes128;
+        }
+
+        while (rd < vecEnd64)
+        {
+            vec64 v_in = vld1(in + rd);
+            vst1(out + wr - lanes64, vrev64(v_in));
+
+            rd += lanes64;
+            wr -= lanes64;
+        }
+
+        // Element-by-element copy of whatever is left.
+        while (rd < size.width)
+        {
+            --wr;
+            out[wr] = in[rd];
+            ++rd;
+        }
+    }
+}
+
+// Reverses the order of the pixels in every row of an image whose pixels are
+// three interleaved T values; the three values of a pixel keep their order.
+// When flipMode has the FLIP_VERTICAL_MODE bit set, row i of the source is
+// additionally written to row (height - 1 - i) of the destination.
+// The 128-bit path is compiled out when ANDROID is defined.
+template <typename T>
+void flip3(const Size2D & size,
+           const void * srcBase, ptrdiff_t srcStride,
+           void * dstBase, ptrdiff_t dstStride,
+           FLIP_MODE flipMode)
+{
+    using namespace internal;
+
+#ifndef ANDROID
+    typedef typename VecTraits<T, 3>::vec128 vec128;
+#endif
+    typedef typename VecTraits<T, 3>::vec64 vec64;
+
+#ifndef ANDROID
+    // Pixels per 128-bit step and the matching number of T values.
+    u32 px128 = 16 / sizeof(T), elems128 = px128 * 3;
+    size_t vecEnd128 = size.width >= (px128 - 1) ? size.width - px128 + 1 : 0;
+#endif
+    // Pixels per 64-bit step and the matching number of T values.
+    u32 px64 = 8 / sizeof(T), elems64 = px64 * 3;
+    size_t vecEnd64 = size.width >= (px64 - 1) ? size.width - px64 + 1 : 0;
+
+    const bool mirrorRows = (flipMode & FLIP_VERTICAL_MODE) != 0;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const size_t dstRow = mirrorRows ? size.height - row - 1 : row;
+        const T * in = getRowPtr((const T *)srcBase, srcStride, row);
+        T * out = getRowPtr((T *)dstBase, dstStride, dstRow);
+
+        // px counts pixels; rd / wr are offsets in T values. rd walks forward
+        // through the source, wr backwards through the destination.
+        size_t px = 0, rd = 0, wr = size.width * 3;
+
+#ifndef ANDROID
+        while (px < vecEnd128)
+        {
+            prefetch(in + rd);
+
+            vec128 v_in = vld3q(in + rd), v_out;
+            v_in.val[0] = vrev64q(v_in.val[0]);
+            v_in.val[1] = vrev64q(v_in.val[1]);
+            v_in.val[2] = vrev64q(v_in.val[2]);
+
+            // Swap the 64-bit halves to finish the 128-bit reversal per channel.
+            v_out.val[0] = vcombine(vget_high(v_in.val[0]), vget_low(v_in.val[0]));
+            v_out.val[1] = vcombine(vget_high(v_in.val[1]), vget_low(v_in.val[1]));
+            v_out.val[2] = vcombine(vget_high(v_in.val[2]), vget_low(v_in.val[2]));
+
+            vst3q(out + wr - elems128, v_out);
+
+            px += px128;
+            rd += elems128;
+            wr -= elems128;
+        }
+#endif // ANDROID
+
+        while (px < vecEnd64)
+        {
+            vec64 v_in = vld3(in + rd), v_out;
+            v_out.val[0] = vrev64(v_in.val[0]);
+            v_out.val[1] = vrev64(v_in.val[1]);
+            v_out.val[2] = vrev64(v_in.val[2]);
+
+            vst3(out + wr - elems64, v_out);
+
+            px += px64;
+            rd += elems64;
+            wr -= elems64;
+        }
+
+        // Pixel-by-pixel copy of whatever is left.
+        while (px < size.width)
+        {
+            wr -= 3;
+            out[wr] = in[rd];
+            out[wr + 1] = in[rd + 1];
+            out[wr + 2] = in[rd + 2];
+
+            ++px;
+            rd += 3;
+        }
+    }
+}
+
+// Common signature of the flip<T> / flip3<T> instantiations, used to pick an
+// implementation at run time.
+typedef void (* flipFunc)(const Size2D & size,
+                          const void * srcBase, ptrdiff_t srcStride,
+                          void * dstBase, ptrdiff_t dstStride,
+                          FLIP_MODE flipMode);
+
+} // namespace
+
+#endif
+
+// Flips an image whose elements are elemSize bytes wide.
+//
+// FLIP_VERTICAL_MODE copies whole rows into reversed row order and works for
+// any elemSize. The other modes dispatch on elemSize to flip<u8>, flip<u16>,
+// flip<u32> or flip3<u8>; for any other elemSize nothing is written.
+// Without CAROTENE_NEON only the configuration assert is executed.
+void flip(const Size2D &size,
+          const u8 * srcBase, ptrdiff_t srcStride,
+          u8 * dstBase, ptrdiff_t dstStride,
+          FLIP_MODE flipMode, u32 elemSize)
+{
+    internal::assertSupportedConfiguration(isFlipSupported(flipMode, elemSize));
+#ifdef CAROTENE_NEON
+
+    if (flipMode == FLIP_VERTICAL_MODE)
+    {
+        const size_t rowBytes = elemSize * size.width;
+
+        // dstRow runs from the last row down to the first.
+        size_t dstRow = size.height;
+        for (size_t srcRow = 0; srcRow < size.height; ++srcRow)
+        {
+            --dstRow;
+            std::memcpy(internal::getRowPtr(dstBase, dstStride, dstRow),
+                        internal::getRowPtr(srcBase, srcStride, srcRow),
+                        rowBytes);
+        }
+        return;
+    }
+
+    // Pick the implementation that matches the element size.
+    flipFunc impl = NULL;
+
+    if (elemSize == (u32)sizeof(u8) * 3)
+        impl = &flip3<u8>;
+    else if (elemSize == (u32)sizeof(u32))
+        impl = &flip<u32>;
+    else if (elemSize == (u32)sizeof(u16))
+        impl = &flip<u16>;
+    else if (elemSize == (u32)sizeof(u8))
+        impl = &flip<u8>;
+
+    if (impl != NULL)
+        impl(size,
+             srcBase, srcStride,
+             dstBase, dstStride,
+             flipMode);
+
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)flipMode;
+    (void)elemSize;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/gaussian_blur.cpp
+++ b/3rdparty/carotene/src/gaussian_blur.cpp
--- a/3rdparty/carotene/src/in_range.cpp
+++ b/3rdparty/carotene/src/in_range.cpp
@ -0,0 +1,195 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+// vnst: stores two compare-mask vectors to dst as one byte per lane.
+// 8-bit lanes are written as they are (32 bytes in total).
+inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2)
+{
+    vst1q_u8(dst, v1);
+    vst1q_u8(dst+16, v2);
+}
+// 16-bit lanes are narrowed to 8 bits first (16 bytes in total).
+inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2)
+{
+    const uint8x16_t packed = vcombine_u8(vmovn_u16(v1), vmovn_u16(v2));
+    vst1q_u8(dst, packed);
+}
+// 32-bit lanes are narrowed twice, to 16 and then to 8 bits (8 bytes in total).
+inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2)
+{
+    const uint16x8_t half = vcombine_u16(vmovn_u32(v1), vmovn_u32(v2));
+    vst1_u8(dst, vmovn_u16(half));
+}
+
+// Vectorised handling of the elements left over after the main loop of
+// inRangeCheck(). The primary template is a no-op; the specialisations for
+// element sizes 1 and 2 provide real implementations.
+template <typename T, int elsize>
+struct vtail
+{
+    static inline void inRange(const T *, const T *, const T *,
+                               u8 *, size_t &, size_t)
+    {
+        // Intentionally empty: there cannot be enough data left for a vector.
+    }
+};
+// Tail handler for 2-byte element types: processes one 8-element vector when
+// at least 8 elements remain, and advances x past them.
+//
+// src / rng1 / rng2 / dst are row pointers, x is the index of the first
+// unprocessed element (updated in place) and width is the row length in
+// elements. dst receives one mask byte per element, produced by and-ing the
+// rng1 <= src and src <= rng2 compare masks.
+template <typename T> struct vtail<T, 2>
+{
+    static inline void inRange(const T * src, const T * rng1, const T * rng2,
+                               u8 * dst, size_t &x, size_t width)
+    {
+        typedef typename internal::VecTraits<T>::vec128 vec128;
+        typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
+        // There are no more than 15 elements in the tail, so an 8-element
+        // vector can be handled at most once. The vector touches elements
+        // x .. x + 7, hence it fits whenever x + 8 <= width (a tail of exactly
+        // 8 elements included).
+        if( x + 8 <= width)
+        {
+             vec128  vs = internal::vld1q( src + x);
+             vec128 vr1 = internal::vld1q(rng1 + x);
+             vec128 vr2 = internal::vld1q(rng2 + x);
+            uvec128  vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
+            // Narrow the 16-bit lane masks to one byte per element.
+            internal::vst1(dst + x, internal::vmovn(vd));
+            x+=8;
+        }
+    }
+};
+// Tail handler for 1-byte element types: processes one 16-element vector
+// and/or one 8-element vector when enough elements remain, and advances x
+// past them.
+//
+// src / rng1 / rng2 / dst are row pointers, x is the index of the first
+// unprocessed element (updated in place) and width is the row length in
+// elements. dst receives one mask byte per element, produced by and-ing the
+// rng1 <= src and src <= rng2 compare masks.
+template <typename T> struct vtail<T, 1>
+{
+    static inline void inRange(const T * src, const T * rng1, const T * rng2,
+                               u8 * dst, size_t &x, size_t width)
+    {
+        typedef typename internal::VecTraits<T>::vec128 vec128;
+        typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
+        typedef typename internal::VecTraits<T>::vec64 vec64;
+        typedef typename internal::VecTraits<T>::unsign::vec64 uvec64;
+        // There are no more than 31 elements in the tail, so it can be covered
+        // by at most one 16-element and one 8-element vector. A vector of n
+        // elements touches x .. x + n - 1, hence it fits whenever
+        // x + n <= width (an exactly-sized tail included).
+        if( x + 16 <= width)
+        {
+             vec128  vs = internal::vld1q( src + x);
+             vec128 vr1 = internal::vld1q(rng1 + x);
+             vec128 vr2 = internal::vld1q(rng2 + x);
+            uvec128  vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
+            internal::vst1q(dst + x, vd);
+            x+=16;
+        }
+        if( x + 8 <= width)
+        {
+             vec64  vs = internal::vld1( src + x);
+             vec64 vr1 = internal::vld1(rng1 + x);
+             vec64 vr2 = internal::vld1(rng2 + x);
+            uvec64  vd = internal::vand(internal::vcge(vs, vr1), internal::vcge(vr2, vs));
+            internal::vst1(dst + x, vd);
+            x+=8;
+        }
+    }
+};
+
+// Element-wise range test: dst[i] is set to 0xFF when
+// rng1[i] <= src[i] <= rng2[i] and to 0 otherwise.
+//
+// _size is the image size in elements; every image is addressed row by row
+// through internal::getRowPtr(base, stride, row). The main loop consumes
+// 32 / sizeof(T) elements per iteration (two 128-bit vectors), vtail handles
+// a vectorisable remainder and a scalar loop finishes the row.
+template <typename T>
+inline void inRangeCheck(const Size2D &_size,
+                         const T * srcBase, ptrdiff_t srcStride,
+                         const T * rng1Base, ptrdiff_t rng1Stride,
+                         const T * rng2Base, ptrdiff_t rng2Stride,
+                         u8 * dstBase, ptrdiff_t dstStride)
+{
+    typedef typename internal::VecTraits<T>::vec128 vec128;
+    typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
+
+    Size2D size(_size);
+
+    // When every image is stored without padding between rows, the whole
+    // image can be processed as one long row. Each stride has to be compared
+    // with the byte length of a row of its own element type: the inputs hold
+    // T elements while dst holds one u8 per element.
+    // NOTE(review): this relies on strides being expressed in bytes - confirm
+    // against internal::getRowPtr.
+    const ptrdiff_t srcRowBytes = (ptrdiff_t)(size.width * sizeof(T));
+    const ptrdiff_t dstRowBytes = (ptrdiff_t)(size.width);
+    if (srcStride == srcRowBytes &&
+        rng1Stride == srcRowBytes &&
+        rng2Stride == srcRowBytes &&
+        dstStride == dstRowBytes)
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+    // Largest multiple of the main-loop step that fits in a row.
+    const size_t width = size.width & ~( 32/sizeof(T) - 1 );
+
+    for(size_t j = 0; j < size.height; ++j)
+    {
+        const T *  src = internal::getRowPtr( srcBase,  srcStride, j);
+        const T * rng1 = internal::getRowPtr(rng1Base, rng1Stride, j);
+        const T * rng2 = internal::getRowPtr(rng2Base, rng2Stride, j);
+             u8 *  dst = internal::getRowPtr( dstBase,  dstStride, j);
+        size_t i = 0;
+        for( ; i < width; i += 32/sizeof(T) )
+        {
+            internal::prefetch(src + i);
+            internal::prefetch(rng1 + i);
+            internal::prefetch(rng2 + i);
+
+             vec128  vs = internal::vld1q( src + i);
+             vec128 vr1 = internal::vld1q(rng1 + i);
+             vec128 vr2 = internal::vld1q(rng2 + i);
+            uvec128 vd1 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
+                     vs = internal::vld1q( src + i + 16/sizeof(T));
+                    vr1 = internal::vld1q(rng1 + i + 16/sizeof(T));
+                    vr2 = internal::vld1q(rng2 + i + 16/sizeof(T));
+            uvec128 vd2 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
+            // vnst narrows the lane masks to one byte per element if needed.
+            vnst(dst + i, vd1, vd2);
+        }
+        vtail<T, sizeof(T)>::inRange(src, rng1, rng2, dst, i, size.width);
+        // Scalar remainder: negating the 0/1 result yields 0x00 / 0xFF.
+        for( ; i < size.width; i++ )
+            dst[i] = (u8)(-(rng1[i] <= src[i] && src[i] <= rng2[i]));
+    }
+}
+
+}
+
+// INRANGEFUNC(T) defines the public inRange() overload for element type T.
+// With NEON it asserts the configuration and forwards to inRangeCheck().
+#define INRANGEFUNC(T)                                        \
+void inRange(const Size2D &size,                              \
+             const T * srcBase, ptrdiff_t srcStride,          \
+             const T * rng1Base, ptrdiff_t rng1Stride,        \
+             const T * rng2Base, ptrdiff_t rng2Stride,        \
+             u8 * dstBase, ptrdiff_t dstStride)               \
+{                                                             \
+    internal::assertSupportedConfiguration();                 \
+    inRangeCheck(size,                                        \
+                 srcBase, srcStride,                          \
+                 rng1Base, rng1Stride,                        \
+                 rng2Base, rng2Stride,                        \
+                 dstBase, dstStride);                         \
+}
+#else
+// Without NEON the overloads only execute the configuration assert.
+#define INRANGEFUNC(T)                                        \
+void inRange(const Size2D &,                                  \
+             const T *, ptrdiff_t,                            \
+             const T *, ptrdiff_t,                            \
+             const T *, ptrdiff_t,                            \
+             u8 *, ptrdiff_t)                                 \
+{                                                             \
+    internal::assertSupportedConfiguration();                 \
+}
+#endif
+
+// One inRange() overload per supported element type.
+INRANGEFUNC(u8)
+INRANGEFUNC(s8)
+INRANGEFUNC(u16)
+INRANGEFUNC(s16)
+INRANGEFUNC(s32)
+INRANGEFUNC(f32)
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/integral.cpp
+++ b/3rdparty/carotene/src/integral.cpp
@ -0,0 +1,238 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+namespace CAROTENE_NS {
+
+// Computes the inclusive running 2-D sum of an 8-bit image: on return,
+// sum(y, x) holds the total of src over rows 0..y and columns 0..x
+// (accumulated in u32 arithmetic).
+//
+//   size                - image dimensions (width / height)
+//   srcBase, srcStride  - source u8 image and its row stride
+//   sumBase, sumStride  - destination u32 image and its row stride
+//
+// Rows are located with internal::getRowPtr(); presumably the strides are in
+// bytes - confirm against that helper. When CAROTENE_NEON is not defined the
+// function only asserts the configuration and does not touch the output.
+void integral(const Size2D &size,
+              const u8 * srcBase, ptrdiff_t srcStride,
+              u32 * sumBase, ptrdiff_t sumStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Row 0 is processed outside the row loop, so an image without rows must
+    // be rejected before row 0 is read or written.
+    if (size.height == 0)
+        return;
+
+    uint32x4_t v_zero = vmovq_n_u32(0u);
+
+    // the first iteration
+    const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
+    u32 * sum = internal::getRowPtr(sumBase, sumStride, 0);
+
+    // every lane of prev carries the running sum of the columns already done
+    uint32x4_t prev = v_zero;
+    size_t j = 0u;
+
+    for ( ; j + 7 < size.width; j += 8)
+    {
+        internal::prefetch(sum + j);
+        internal::prefetch(src + j);
+
+        // el8shrN is the 8-pixel block moved up by N byte lanes with zeros
+        // shifted in (assumes lane 0 is the least significant byte of the
+        // 64-bit value, i.e. the usual little-endian NEON layout)
+        uint8x8_t el8shr0 = vld1_u8(src + j);
+        uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
+        uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
+        uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
+
+        uint16x8_t el8shr12 =  vaddl_u8(el8shr1, el8shr2);
+        uint16x8_t el8shr03 =  vaddl_u8(el8shr0, el8shr3);
+
+        // el8[i] = p[i] + p[i-1] + p[i-2] + p[i-3]: lanes 0..3 are already the
+        // prefix sums of the first four pixels
+        uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
+        // adding the low half to the high half completes prefix sums 4..7
+        uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
+
+        uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8));
+        uint32x4_t vsumh = vaddw_u16(prev, el4h);
+
+        vst1q_u32(sum + j, vsuml);
+        vst1q_u32(sum + j + 4, vsumh);
+
+        // carry the total of this block (lane 3 of el4h) into the next block
+        prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
+    }
+
+    // scalar tail for the remaining (< 8) columns
+    for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
+        sum[j] = (v += src[j]);
+
+    // the others
+    for (size_t i = 1; i < size.height ; ++i)
+    {
+        src = internal::getRowPtr(srcBase, srcStride, i);
+        u32 * prevSum = internal::getRowPtr(sumBase, sumStride, i - 1);
+        sum = internal::getRowPtr(sumBase, sumStride, i);
+
+        prev = v_zero;
+        j = 0u;
+
+        for ( ; j + 7 < size.width; j += 8)
+        {
+            internal::prefetch(sum + j);
+            internal::prefetch(src + j);
+
+            // start from the already finished sums of the row above
+            uint32x4_t vsuml = vld1q_u32(prevSum + j);
+            uint32x4_t vsumh = vld1q_u32(prevSum + j + 4);
+
+            uint8x8_t el8shr0 = vld1_u8(src + j);
+            uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
+            uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
+            uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
+
+            // add the running sum of this row's previous blocks
+            vsuml = vaddq_u32(vsuml, prev);
+            vsumh = vaddq_u32(vsumh, prev);
+
+            uint16x8_t el8shr12 =  vaddl_u8(el8shr1, el8shr2);
+            uint16x8_t el8shr03 =  vaddl_u8(el8shr0, el8shr3);
+
+            uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
+            uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
+
+            // and finally the prefix sums inside the current block
+            vsuml = vaddw_u16(vsuml, vget_low_u16(el8));
+            vsumh = vaddw_u16(vsumh, el4h);
+
+            vst1q_u32(sum + j, vsuml);
+            vst1q_u32(sum + j + 4, vsumh);
+
+            prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
+        }
+
+        for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
+            sum[j] = (v += src[j]) + prevSum[j];
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)sumBase;
+    (void)sumStride;
+#endif
+}
+
+// Computes the inclusive running 2-D sum of squared pixel values: on return,
+// sqsum(y, x) holds the total of src*src over rows 0..y and columns 0..x,
+// stored as f64.
+//
+//   size                    - image dimensions (width / height)
+//   srcBase, srcStride      - source u8 image and its row stride
+//   sqsumBase, sqsumStride  - destination f64 image and its row stride
+//
+// Rows are located with internal::getRowPtr(); presumably the strides are in
+// bytes - confirm against that helper. When CAROTENE_NEON is not defined the
+// function only asserts the configuration and does not touch the output.
+void sqrIntegral(const Size2D &size,
+                 const u8 * srcBase, ptrdiff_t srcStride,
+                 f64 * sqsumBase, ptrdiff_t sqsumStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Row 0 is processed outside the row loop, so an image without rows must
+    // be rejected before row 0 is read or written.
+    if (size.height == 0)
+        return;
+
+    uint16x8_t v_zero8 = vmovq_n_u16(0u);
+
+    // the first iteration
+    const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
+    f64 * sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0);
+
+    // running sum of the squares of the columns already processed in this row
+    double prev = 0.;
+    size_t j = 0u;
+
+    for ( ; j + 7 < size.width; j += 8)
+    {
+        internal::prefetch(sqsum + j);
+        internal::prefetch(src + j);
+
+        uint8x8_t vsrc = vld1_u8(src + j);
+
+        // el8shr0 = squares s0..s7, el8shr1 = the same moved up one lane
+        // with a zero shifted into lane 0
+        uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
+        uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
+
+        // pairwise sums s[i] + s[i-1], widened to 32 bits
+        uint32x4_t el8shr01l =  vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
+        uint32x4_t el8shr01h =  vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
+
+        uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
+
+        // combine the pair sums into the prefix sums 2..3, 4..5 and 6..7
+        uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
+        uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
+        uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
+
+        // buf[k] = s0 + ... + sk; spilled so it can be converted to f64
+        u32 buf[8];
+        vst1_u32(buf, vget_low_u32(el8shr01l));
+        vst1_u32(buf+2, el2l);
+        vst1_u32(buf+4, el2hl);
+        vst1_u32(buf+6, el2hh);
+        for(u32 k=0; k < 8; k++)
+            sqsum[j+k] = prev + buf[k];
+        prev += buf[7];
+    }
+
+    // scalar tail for the remaining (< 8) columns
+    for (; j < size.width; ++j)
+        sqsum[j] = (prev += src[j]*src[j]);
+
+    // the others
+    for (size_t i = 1; i < size.height ; ++i)
+    {
+        src = internal::getRowPtr(srcBase, srcStride, i);
+        f64 * prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1);
+        sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i);
+
+        prev = 0.;
+        j = 0u;
+
+        for ( ; j + 7 < size.width; j += 8)
+        {
+            internal::prefetch(sqsum + j);
+            internal::prefetch(src + j);
+
+            uint8x8_t vsrc = vld1_u8(src + j);
+
+            uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
+            uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
+
+            uint32x4_t el8shr01l =  vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
+            uint32x4_t el8shr01h =  vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
+
+            uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
+
+            uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
+            uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
+            uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
+
+            u32 buf[8];
+            vst1_u32(buf, vget_low_u32(el8shr01l));
+            vst1_u32(buf+2, el2l);
+            vst1_u32(buf+4, el2hl);
+            vst1_u32(buf+6, el2hh);
+            // row prefix sum plus the finished value of the row above
+            for(u32 k=0; k < 8; k++)
+                sqsum[j+k] = prev + prevSqSum[j+k] + buf[k];
+            prev += buf[7];
+        }
+
+        for (; j < size.width; ++j)
+            sqsum[j] = (prev += src[j]*src[j]) + prevSqSum[j];
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)sqsumBase;
+    (void)sqsumStride;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/intrinsics.hpp
+++ b/3rdparty/carotene/src/intrinsics.hpp
@ -0,0 +1,112 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#ifndef CAROTENE_INTRINSICS_HPP
+#define CAROTENE_INTRINSICS_HPP
+
+#include <carotene/definitions.hpp>
+
+#include <arm_neon.h>
+
+namespace CAROTENE_NS { namespace internal {
+
+/////////////// Custom NEON intrinsics ///////////////////
+
+// calculate reciprocal value
+
+inline float32x4_t vrecpq_f32(float32x4_t val)
+{
+    // hardware estimate of 1/val, sharpened by two vrecps refinement steps
+    float32x4_t est = vrecpeq_f32(val);
+    for (int step = 0; step < 2; ++step)
+        est = vmulq_f32(vrecpsq_f32(val, est), est);
+    return est;
+}
+
+inline float32x2_t vrecp_f32(float32x2_t val)
+{
+    // 64-bit (two lane) variant: estimate of 1/val plus two vrecps refinements
+    float32x2_t est = vrecpe_f32(val);
+    for (int step = 0; step < 2; ++step)
+        est = vmul_f32(vrecps_f32(val, est), est);
+    return est;
+}
+
+// calculate reciprocal sqrt and sqrt values
+
+inline float32x4_t vrsqrtq_f32(float32x4_t val)
+{
+    // hardware estimate of 1/sqrt(val), sharpened by two vrsqrts refinements
+    float32x4_t est = vrsqrteq_f32(val);
+    for (int step = 0; step < 2; ++step)
+    {
+        const float32x4_t estSq = vmulq_f32(est, est);
+        est = vmulq_f32(vrsqrtsq_f32(estSq, val), est);
+    }
+    return est;
+}
+
+inline float32x2_t vrsqrt_f32(float32x2_t val)
+{
+    // 64-bit (two lane) variant: estimate of 1/sqrt(val) plus two refinements
+    float32x2_t est = vrsqrte_f32(val);
+    for (int step = 0; step < 2; ++step)
+    {
+        const float32x2_t estSq = vmul_f32(est, est);
+        est = vmul_f32(vrsqrts_f32(estSq, val), est);
+    }
+    return est;
+}
+
+inline float32x4_t vsqrtq_f32(float32x4_t val)
+{
+    // sqrt(val) is obtained as the reciprocal of the reciprocal square root
+    const float32x4_t invRoot = vrsqrtq_f32(val);
+    return vrecpq_f32(invRoot);
+}
+
+inline float32x2_t vsqrt_f32(float32x2_t val)
+{
+    // sqrt(val) is obtained as the reciprocal of the reciprocal square root
+    const float32x2_t invRoot = vrsqrt_f32(val);
+    return vrecp_f32(invRoot);
+}
+
+// table lookup with the table in a 128-bit register
+
+inline uint8x8_t vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
+{
+#ifndef __aarch64__
+    // view the 128-bit table as a pair of 64-bit tables and use the
+    // two-register lookup instead
+    union { uint8x16_t v; uint8x8x2_t w; } halves = { a };
+    return vtbl2_u8(halves.w, b);
+#else
+    // AArch64 provides the 128-bit table lookup as a native intrinsic
+    return ::vqtbl1_u8(a, b);
+#endif
+}
+
+} }
+
+#endif
--- a/3rdparty/carotene/src/laplacian.cpp
+++ b/3rdparty/carotene/src/laplacian.cpp
@ -0,0 +1,713 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "saturate_cast.hpp"
+
+#include <vector>
+
+namespace CAROTENE_NS {
+
+// Reports whether Laplacian3x3() accepts the given size / border combination:
+// the configuration must be supported, the image at least 8 pixels wide and
+// the border mode either CONSTANT or REPLICATE.
+bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border)
+{
+    if (!isSupportedConfiguration())
+        return false;
+
+    if (size.width < 8)
+        return false;
+
+    return border == BORDER_MODE_CONSTANT ||
+           border == BORDER_MODE_REPLICATE;
+}
+
+// Applies a 3x3 filter to an 8-bit image: every output pixel is the sum of the
+// 3x3 neighbourhood minus 9 times the centre pixel, saturated to u8.
+//
+//   size                - image dimensions; width must be >= 8
+//   srcBase, srcStride  - source u8 image and its row stride
+//   dstBase, dstStride  - destination u8 image and its row stride
+//   border              - BORDER_MODE_CONSTANT or BORDER_MODE_REPLICATE
+//   borderValue         - pixel value used outside the image in CONSTANT mode
+//
+// The supported combinations are asserted via isLaplacian3x3Supported().
+// When CAROTENE_NEON is not defined the function does nothing beyond that.
+void Laplacian3x3(const Size2D &size,
+                  const u8 * srcBase, ptrdiff_t srcStride,
+                  u8 * dstBase, ptrdiff_t dstStride,
+                  BORDER_MODE border, u8 borderValue)
+{
+    internal::assertSupportedConfiguration(isLaplacian3x3Supported(size, border));
+#ifdef CAROTENE_NEON
+    const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);
+    const uint16x8_t v_zero = vdupq_n_u16(0);
+    const uint8x8_t v_border = vdup_n_u8(borderValue);
+
+    // vsub keeps the centre-row pixels of the block whose result is written
+    // one iteration later
+    uint8x8_t vsub;
+    uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
+    uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
+
+    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
+
+    for (ptrdiff_t y = 0; y < height; ++y)
+    {
+        // a NULL row pointer marks a row that lies in the constant border
+        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
+        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
+        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
+        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
+
+        // three-row column sums used by the scalar tail
+        s16 prevx = 0, currx = 0, nextx = 0;
+        ptrdiff_t x = 0;
+        // NOTE(review): the vector loop stops 8 columns earlier on the last
+        // rows, presumably so that its 8-byte loads stay inside the image
+        // buffer - confirm.
+        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
+
+        // perform vertical convolution
+        for ( ; x <= bwidth; x += 8)
+        {
+            internal::prefetch(srow0 + x);
+            internal::prefetch(srow1 + x);
+            internal::prefetch(srow2 + x);
+
+            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
+            uint8x8_t x1 = vld1_u8(srow1 + x);
+            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
+
+            // calculate values for plain CPU part below if needed
+            if (x + 8 >= bwidth)
+            {
+                ptrdiff_t x3 = x == width ? width - 1 : x;
+                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
+
+                // the column left of the image consists of three border
+                // pixels, so its column sum is 3 * borderValue (this matches
+                // v_border_x3 above and nextx at the right edge below)
+                if (border == BORDER_MODE_CONSTANT && x4 < 0)
+                    prevx = borderValue * 3;
+                else
+                    prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);
+
+                currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
+            }
+
+            // make shift
+            if (x)
+            {
+                tprev = tcurr;
+                tcurr = tnext;
+            }
+
+            // and calculate next value
+            tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
+
+            // make extrapolation for the first elements
+            if (!x)
+            {
+                // make border
+                if (border == BORDER_MODE_CONSTANT)
+                    tcurr = v_border_x3;
+                else if (border == BORDER_MODE_REPLICATE)
+                    tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
+
+                vsub = x1;
+
+                continue;
+            }
+
+            // combine 3 "shifted" vectors
+            t0 = vextq_u16(tprev, tcurr, 7);
+            t1 = tcurr;
+            t2 = vextq_u16(tcurr, tnext, 1);
+
+            // and add them
+            t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
+
+            // subtract 9 * centre pixel ((c << 3) + c), then saturate to u8
+            int16x8_t tt0 = vsubq_s16(vreinterpretq_s16_u16(t0),
+                                      vreinterpretq_s16_u16(vaddw_u8(vshll_n_u8(vsub, 3), vsub)));
+            uint8x8_t it0 = vqmovun_s16(tt0);
+            vst1_u8(drow + x - 8, it0);
+
+            vsub = x1;
+        }
+
+        // step back to the first column the vector loop did not finish
+        x -= 8;
+        if (x == width)
+            --x;
+
+        for ( ; x < width; ++x)
+        {
+            // make extrapolation for the last elements
+            if (x + 1 >= width)
+            {
+                if (border == BORDER_MODE_CONSTANT)
+                    nextx = borderValue * 3;
+                else if (border == BORDER_MODE_REPLICATE)
+                    nextx = srow2[x] + srow1[x] + srow0[x];
+            }
+            else
+            {
+                nextx = (srow2 ? srow2[x + 1] : borderValue) +
+                                 srow1[x + 1] +
+                        (srow0 ? srow0[x + 1] : borderValue);
+            }
+
+            s32 val = (prevx + currx + nextx) - 9 * srow1[x];
+            drow[x] = internal::saturate_cast<u8>((s32)val);
+
+            // make shift
+            prevx = currx;
+            currx = nextx;
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)border;
+    (void)borderValue;
+#endif
+}
+
+// Reports whether the Laplacian*OpenCV() functions accept the given size /
+// border combination: the configuration must be supported, the image at least
+// 8 pixels wide and 1 pixel high, and the border mode one of CONSTANT,
+// REFLECT, REFLECT101 or REPLICATE.
+bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border)
+{
+    if (!isSupportedConfiguration())
+        return false;
+
+    if (size.width < 8 || size.height < 1)
+        return false;
+
+    if (border == BORDER_MODE_CONSTANT)
+        return true;
+    if (border == BORDER_MODE_REFLECT)
+        return true;
+    if (border == BORDER_MODE_REFLECT101)
+        return true;
+    return border == BORDER_MODE_REPLICATE;
+}
+
+// Applies the cross-shaped Laplacian to an 8-bit image and stores s16 results:
+//   dst(y, x) = src(y-1, x) + src(y+1, x) + src(y, x-1) + src(y, x+1)
+//               - 4 * src(y, x)
+//
+//   size                - image dimensions; width >= 8 and height >= 1
+//   srcBase, srcStride  - source u8 image and its row stride
+//   dstBase, dstStride  - destination s16 image and its row stride
+//   border              - CONSTANT, REFLECT, REFLECT101 or REPLICATE
+//   borderValue         - pixel value used outside the image in CONSTANT mode
+//
+// The supported combinations are asserted via isLaplacianOpenCVSupported().
+// When CAROTENE_NEON is not defined the function does nothing beyond that.
+void Laplacian1OpenCV(const Size2D &size,
+                      const u8 * srcBase, ptrdiff_t srcStride,
+                      s16 * dstBase, ptrdiff_t dstStride,
+                      BORDER_MODE border, u8 borderValue)
+{
+    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
+#ifdef CAROTENE_NEON
+    ptrdiff_t rows = size.height, cols = size.width;
+
+    // scratch row filled with borderValue; stands in for the rows above and
+    // below the image in CONSTANT mode
+    std::vector<u8> _tmp;
+    u8 *tmp = 0;
+    if (border == BORDER_MODE_CONSTANT)
+    {
+        // The vector loop loads 8 bytes from tmp + x with x as large as cols,
+        // i.e. it touches tmp[cols + 7]. tmp starts 2 bytes into the buffer,
+        // so at least cols + 10 bytes are needed to keep those loads inside
+        // the allocation.
+        _tmp.assign(cols + 4 + 8, borderValue);
+        tmp = &_tmp[2];
+    }
+
+    for( ptrdiff_t y = 0; y < rows; y++ )
+    {
+        const u8* v0 = 0;
+        const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
+        const u8* v2 = 0;
+        // make border
+        if (border == BORDER_MODE_REFLECT101) {
+            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
+            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
+        } else  if (border == BORDER_MODE_CONSTANT) {
+            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
+            v2 =  y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
+        } else {
+            // REPLICATE and REFLECT both reuse the edge row
+            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
+            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
+        }
+        s16* drow = internal::getRowPtr(dstBase, dstStride, y);
+
+        // tcurr / tnext: vertical part (above + below - 4 * centre) of the
+        // current and the following 8-pixel block
+        int16x8_t tcurr = vmovq_n_s16(0x0);
+        int16x8_t tnext = vmovq_n_s16(0x0);
+        int16x8_t t0, t2;
+        // xx0 / xx1 / xx2: centre-row pixels of three consecutive blocks
+        uint8x8_t xx0 = vmov_n_u8(0x0);
+        uint8x8_t xx1 = vmov_n_u8(0x0);
+        uint8x8_t xx2 = vmov_n_u8(0x0);
+        ptrdiff_t x = 0;
+        // NOTE(review): the vector loop stops 8 columns earlier on the last
+        // rows, presumably so that its 8-byte loads stay inside the image
+        // buffer - confirm.
+        const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
+        for( ; x <= bcols; x += 8 )
+        {
+            internal::prefetch(v0 + x);
+            internal::prefetch(v1 + x);
+            internal::prefetch(v2 + x);
+
+            uint8x8_t x0 = vld1_u8(v0 + x);
+            uint8x8_t x1 = vld1_u8(v1 + x);
+            uint8x8_t x2 = vld1_u8(v2 + x);
+
+            if(x) {
+                xx0 = xx1;
+                xx1 = xx2;
+            } else {
+                xx1 = x1;
+                // make border: lane 7 of the "previous" block is the pixel
+                // left of column 0
+                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
+                {
+                    xx1 = vset_lane_u8(vget_lane_u8(x1, 0),x1, 7);
+                }
+                else if (border == BORDER_MODE_CONSTANT)
+                {
+                    xx1 = vset_lane_u8(borderValue, x1, 7);
+                }
+                else if (border == BORDER_MODE_REFLECT101)
+                {
+                    xx1 = vset_lane_u8(vget_lane_u8(x1, 1),x1, 7);
+                }
+            }
+            xx2 = x1;
+
+            if(x) {
+                tcurr = tnext;
+            }
+            tnext = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x0, x2)),
+                              vreinterpretq_s16_u16(vshll_n_u8(x1, 2)));
+
+            // the first block only primes the pipeline; results are written
+            // one iteration later
+            if(!x) {
+                tcurr = tnext;
+                continue;
+            }
+            // left and right neighbours of the block being written
+            t0 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx0, xx1, 7)));
+            t2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx1, xx2, 1)));
+            t0 = vaddq_s16(vqaddq_s16(t0, t2), tcurr);
+
+            vst1q_s16(drow + x - 8, t0);
+        }
+
+        // step back to the first column the vector loop did not finish
+        x -= 8;
+        if(x == cols){
+            x--;
+        }
+
+        for( ; x < cols; x++ )
+        {
+            s16 nextx;
+            s16 prevx;
+            // make border
+            if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
+            {
+                prevx = x == 0 ? v1[0] : v1[x-1];
+                nextx = x == cols-1 ? v1[x] : v1[x+1];
+            }
+            else if (border == BORDER_MODE_REFLECT101)
+            {
+                prevx = x == 0 ? v1[1] : v1[x-1];
+                nextx = x == cols-1 ? v1[x-1] : v1[x+1];
+            }
+            else //if (border == BORDER_MODE_CONSTANT)
+            {
+                prevx = x == 0 ? borderValue : v1[x-1];
+                nextx = x == cols-1 ? borderValue : v1[x+1];
+            }
+            *(drow+x) = prevx + nextx - 4*v1[x] + v0[x] + v2[x];
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)border;
+    (void)borderValue;
+#endif
+}
+
+// Applies the diagonal Laplacian to an 8-bit image and stores s16 results:
+//   dst(y, x) = 2 * ( src(y-1, x-1) + src(y-1, x+1)
+//                   + src(y+1, x-1) + src(y+1, x+1) - 4 * src(y, x) )
+//
+//   size                - image dimensions; width >= 8 and height >= 1
+//   srcBase, srcStride  - source u8 image and its row stride
+//   dstBase, dstStride  - destination s16 image and its row stride
+//   border              - CONSTANT, REFLECT, REFLECT101 or REPLICATE
+//   borderValue         - pixel value used outside the image in CONSTANT mode
+//
+// The supported combinations are asserted via isLaplacianOpenCVSupported().
+// When CAROTENE_NEON is not defined the function does nothing beyond that.
+void Laplacian3OpenCV(const Size2D &size,
+                      const u8 * srcBase, ptrdiff_t srcStride,
+                      s16 * dstBase, ptrdiff_t dstStride,
+                      BORDER_MODE border, u8 borderValue)
+{
+    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
+#ifdef CAROTENE_NEON
+    ptrdiff_t rows = size.height, cols = size.width;
+
+    // scratch row filled with borderValue; stands in for the rows above and
+    // below the image in CONSTANT mode
+    std::vector<u8> _tmp;
+    u8 *tmp = 0;
+    if (border == BORDER_MODE_CONSTANT)
+    {
+        // The vector loop loads 8 bytes from tmp + x with x as large as cols,
+        // i.e. it touches tmp[cols + 7]. tmp starts 2 bytes into the buffer,
+        // so at least cols + 10 bytes are needed to keep those loads inside
+        // the allocation.
+        _tmp.assign(cols + 4 + 8, borderValue);
+        tmp = &_tmp[2];
+    }
+
+    for( ptrdiff_t y = 0; y < rows; y++ )
+    {
+        const u8* v0 = 0;
+        const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
+        const u8* v2 = 0;
+        // make border
+        if (border == BORDER_MODE_REFLECT101) {
+            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
+            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
+        } else  if (border == BORDER_MODE_CONSTANT) {
+            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
+            v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
+        } else {
+            // REPLICATE and REFLECT both reuse the edge row
+            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
+            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
+        }
+        s16* drow = internal::getRowPtr(dstBase, dstStride, y);
+
+        // tprev / tcurr / tnext: (row above + row below) for three
+        // consecutive 8-pixel blocks; tc: 4 * centre row of the block that is
+        // written next
+        int16x8_t tprev = vmovq_n_s16(0x0);
+        int16x8_t tcurr = vmovq_n_s16(0x0);
+        int16x8_t tnext = vmovq_n_s16(0x0);
+        int16x8_t tc = vmovq_n_s16(0x0);
+        int16x8_t t0, t2, tcnext;
+        ptrdiff_t x = 0;
+        // NOTE(review): the vector loop stops 8 columns earlier on the last
+        // rows, presumably so that its 8-byte loads stay inside the image
+        // buffer - confirm.
+        const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
+        for( ; x <= bcols; x += 8 )
+        {
+            internal::prefetch(v0 + x);
+            internal::prefetch(v1 + x);
+            internal::prefetch(v2 + x);
+
+            uint8x8_t x0 = vld1_u8(v0 + x);
+            uint8x8_t x1 = vld1_u8(v1 + x);
+            uint8x8_t x2 = vld1_u8(v2 + x);
+            tcnext = vreinterpretq_s16_u16(vshll_n_u8(x1, 2));
+
+            if(x) {
+                tprev = tcurr;
+                tcurr = tnext;
+            }
+            tnext = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
+
+            if(!x) {
+                tcurr = tnext;
+                tc = tcnext;
+
+                // make border: lane 7 becomes the (above + below) sum of the
+                // column left of the image; it is consumed as tprev[7] in the
+                // next iteration
+                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
+                {
+                    tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 0),tcurr, 7);
+                }
+                else if (border == BORDER_MODE_CONSTANT)
+                {
+                    // both the pixel above and the pixel below are border
+                    // pixels, so the pair sums to 2 * borderValue (the scalar
+                    // tail below adds borderValue twice as well)
+                    tcurr = vsetq_lane_s16((s16)(2 * borderValue), tcurr, 7);
+                }
+                else if (border == BORDER_MODE_REFLECT101)
+                {
+                    tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 1),tcurr, 7);
+                }
+                continue;
+            }
+
+            // left and right diagonal sums of the block being written
+            t0 = vextq_s16(tprev, tcurr, 7);
+            t2 = vextq_s16(tcurr, tnext, 1);
+
+            t0 = vsubq_s16(vqaddq_s16(t0, t2), tc);
+            tc = tcnext;
+
+            // final factor of 2
+            t0 = vshlq_n_s16(t0, 1);
+            vst1q_s16(drow + x - 8, t0);
+        }
+        // step back to the first column the vector loop did not finish
+        x -= 8;
+        if(x == cols){
+            x--;
+        }
+
+        for( ; x < cols; x++ )
+        {
+            s16 nextx, nextx2;
+            s16 prevx, prevx2;
+            // make border
+            if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
+            {
+                prevx = x == 0 ? v0[0] : v0[x-1];
+                prevx2 = x == 0 ? v2[0] : v2[x-1];
+                nextx = x == cols-1 ? v0[x] : v0[x+1];
+                nextx2 = x == cols-1 ? v2[x] : v2[x+1];
+            }
+            else if (border == BORDER_MODE_REFLECT101)
+            {
+                prevx = x == 0 ? v0[1] : v0[x-1];
+                prevx2 = x == 0 ? v2[1] : v2[x-1];
+                nextx = x == cols-1 ? v0[x-1] : v0[x+1];
+                nextx2 = x == cols-1 ? v2[x-1] : v2[x+1];
+            }
+            else //if (border == BORDER_MODE_CONSTANT)
+            {
+                prevx = x == 0 ? borderValue : v0[x-1];
+                prevx2 = x == 0 ? borderValue : v2[x-1];
+                nextx = x == cols-1 ? borderValue : v0[x+1];
+                nextx2 = x == cols-1 ? borderValue : v2[x+1];
+            }
+            s16 res = prevx + nextx - 4*v1[x] + prevx2 + nextx2;
+            *(drow+x) = 2*res;
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)border;
+    (void)borderValue;
+#endif
+}
+
+void Laplacian5OpenCV(const Size2D &size,
+                      const u8 * srcBase, ptrdiff_t srcStride,
+                      s16 * dstBase, ptrdiff_t dstStride,
+                      BORDER_MODE border, u8 borderValue)
+{
+    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
+#ifdef CAROTENE_NEON
+    ptrdiff_t rows = size.height, cols = size.width;
+
+    std::vector<u8> _tmp;
+    u8 *tmp = 0;
+    if (border == BORDER_MODE_CONSTANT)
+    {
+        _tmp.assign(cols + 4,borderValue);
+        tmp = &_tmp[2];
+    }
+
+    for( ptrdiff_t y = 0; y < rows; y++ )
+    {
+        const u8* v0 = 0;
+        const u8* v1 = 0;
+        const u8* v2 = internal::getRowPtr(srcBase, srcStride, y);
+        const u8* v3 = 0;
+        const u8* v4 = 0;
+        // make border
+        if (border == BORDER_MODE_REPLICATE) {
+            v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0);
+            v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
+            v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
+            v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 0 ? rows-1 : 0);
+        } else if (border == BORDER_MODE_REFLECT) {
+            v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 1 ? 1-y : 0);
+            v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
+            v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
+            v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 1 ? 2*rows-(y+3) : 0);
+        } else if (border == BORDER_MODE_REFLECT101) {
+            v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 2-y ? 2-y : 0); ///check
+            v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0);
+            v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
+            v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 2 ? 2*rows-(y+4) : 0);///bad if rows=2 y=1   rows - 4 + (2,1)
+        } else if (border == BORDER_MODE_CONSTANT) {
+            v0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp;
+            v1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
+            v3 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
+            v4 = y < rows-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp;
+        }
+        s16* drow = internal::getRowPtr(dstBase, dstStride, y);
+
+        int16x8_t tnext, tc, t0;
+        int16x8_t tnext2, tnext3;
+        int16x8_t tnext1Old, tnext2Old, tnext3Old;
+        int16x8_t tnext4OldOldOld, tnext5OldOldOld;
+
+        int16x8_t tcurr1 = vmovq_n_s16(0x0);
+        int16x8_t tnext1 = vmovq_n_s16(0x0);
+        int16x8_t tprev1 = vmovq_n_s16(0x0);
+        int16x8_t tpprev1 = vmovq_n_s16(0x0);
+        int16x8_t tppprev1 = vmovq_n_s16(0x0);
+
+        int16x8_t tnext4Old = vmovq_n_s16(0x0);
+        int16x8_t tnext5Old = vmovq_n_s16(0x0);
+        int16x8_t tnext1OldOld = vmovq_n_s16(0x0);
+        int16x8_t tnext2OldOld = vmovq_n_s16(0x0);
+        int16x8_t tnext3OldOld = vmovq_n_s16(0x0);
+        int16x8_t tnext4OldOld = vmovq_n_s16(0x0);
+        int16x8_t tnext5OldOld = vmovq_n_s16(0x0);
+
+        // do vertical convolution
+        ptrdiff_t x = 0;
+        const ptrdiff_t bcols = y + 3 < rows ? cols : (cols - 8);
+        for( ; x <= bcols; x += 8 )
+        {
+            internal::prefetch(v0 + x);
+            internal::prefetch(v1 + x);
+            internal::prefetch(v2 + x);
+            internal::prefetch(v3 + x);
+            internal::prefetch(v4 + x);
+
+            uint8x8_t x0 = vld1_u8(v0 + x);
+            uint8x8_t x1 = vld1_u8(v1 + x);
+            uint8x8_t x2 = vld1_u8(v2 + x);
+            uint8x8_t x3 = vld1_u8(v3 + x);
+            uint8x8_t x4 = vld1_u8(v4 + x);
+            if(x) {
+                tcurr1 = tnext1;
+            }
+
+            tnext4OldOldOld = tnext4Old;
+            tnext5OldOldOld = tnext5Old;
+            tnext1Old = tnext1OldOld;
+            tnext2Old = tnext2OldOld;
+            tnext3Old = tnext3OldOld;
+            tnext4Old = tnext4OldOld;
+            tnext5Old = tnext5OldOld;
+
+            tnext3 = vreinterpretq_s16_u16(vaddq_u16(vaddl_u8(x3, x2),vaddl_u8(x2, x1)));
+            tnext3 = vshlq_n_s16(tnext3, 1);
+
+            tc = vreinterpretq_s16_u16(vsubl_u8(x4, x2));
+            tnext = vreinterpretq_s16_u16(vsubl_u8(x2, x0));
+            tnext2 = vsubq_s16(tc, tnext);
+
+            tnext1 = vaddq_s16(tnext3, tnext2);
+            // tnext1 = x0 + 2*x1 + 2*x2 + 2*x3 + x4
+
+            tnext2 = vshlq_n_s16(tnext2, 1);
+            // tnext2 = 2*x4 - 4*x2 + 2*x0
+
+            tnext3 = vsubq_s16(tnext2, vshlq_n_s16(tnext3, 1));
+            // tnext3 = 2*x0 - 4*x1 - 12*x2 - 4*x3  + 2*x4
+
+            tnext1OldOld = tnext1;
+            tnext2OldOld = tnext2;
+            tnext3OldOld = tnext3;
+            tnext4OldOld = tnext2;
+            tnext5OldOld = tnext1;
+
+            if(x) {
+                tnext1 = vextq_s16(tnext1Old, tnext1, 2);
+                tcurr1 = vextq_s16(tnext2Old, tnext2, 1);
+                tprev1 = tnext3Old;
+
+                if(x!=8) {
+                    tpprev1 = vextq_s16(tnext4OldOldOld, tnext4Old, 7);
+                    tppprev1 = vextq_s16(tnext5OldOldOld, tnext5Old, 6);
+                }
+            }
+
+            if(!x) {
+                // make border
+                if (border == BORDER_MODE_REPLICATE) {
+                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
+                    tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
+
+                    tprev1 = vextq_s16(tnext1, tnext1, 6);
+                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 0);
+                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
+                } else if (border == BORDER_MODE_REFLECT) {
+                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
+                    tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
+
+                    tprev1 = vextq_s16(tnext1, tnext1, 6);
+                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 0);
+                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
+                } else if (border == BORDER_MODE_REFLECT101) {
+                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
+                    tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 2),tpprev1, 0);
+
+                    tprev1 = vextq_s16(tnext1, tnext1, 6);
+                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 1);
+                    tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 4),tprev1, 0);
+                } else if (border == BORDER_MODE_CONSTANT) {
+                    tpprev1 = vextq_s16(tnext2, tnext2, 7);
+                    tpprev1 = vsetq_lane_s16(borderValue, tpprev1, 0);
+
+                    tprev1 = vextq_s16(tnext1, tnext1, 6);
+                    tprev1 = vsetq_lane_s16(borderValue, tprev1, 0);
+                    tprev1 = vsetq_lane_s16(borderValue, tprev1, 1);
+                }
+                tppprev1 = tprev1;
+                continue;
+            }
+
+            t0 = vaddq_s16(vaddq_s16(vqaddq_s16(tcurr1, tprev1), vqaddq_s16(tpprev1, tppprev1)), tnext1);
+            t0 = vaddq_s16(t0, t0);
+            vst1q_s16(drow + x - 8, t0);
+        }
+        x -= 8;
+        if(x >= cols - 1)
+            x = cols-2;
+
+        s16 pprevx = 0;
+        s16 prevx = 0;
+        s16 nextx = 0;
+        s16 nnextx = 0;
+
+        for( ; x < cols; x++ )
+        {
+            if (x == 0) {
+                // make border
+                if (border == BORDER_MODE_REPLICATE) {
+                    pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
+                    prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
+                } else if (border == BORDER_MODE_REFLECT) {
+                    pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
+                    prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
+                } else if (border == BORDER_MODE_REFLECT101) {
+                    pprevx = v0[2] + 2*v1[2] + 2*v2[2] + 2*v3[2] + v4[2];
+                    prevx = 2*v0[1] - 4*v2[1] + 2*v4[1];
+                } else if (border == BORDER_MODE_CONSTANT) {
+                    pprevx = 8 * borderValue;
+                    prevx = 0;
+                }
+            } else if (x == 1) {
+                // make border
+                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
+                    pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
+                } else if (border == BORDER_MODE_REFLECT101) {
+                    pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
+                } else if (border == BORDER_MODE_CONSTANT) {
+                    pprevx = 8 * borderValue;
+                }
+                prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
+            } else {
+                pprevx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
+                prevx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
+            }
+            s16 currx = 2*v0[x] - 4*v1[x] - 12*v2[x] - 4*v3[x] + 2*v4[x];
+            if (x == cols-1) {
+                // make border
+                if (border == BORDER_MODE_REPLICATE) {
+                    nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
+                    nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
+                } else if (border == BORDER_MODE_REFLECT) {
+                    nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
+                    nnextx = v0[x-1] + 2*v1[x-1] + 2*v2[x-1] + 2*v3[x-1] + v4[x-1];
+                } else if (border == BORDER_MODE_REFLECT101) {
+                    nextx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
+                    nnextx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
+                } else if (border == BORDER_MODE_CONSTANT) {
+                    nextx = 0;
+                    nnextx = 8 * borderValue;
+                }
+            } else if (x == cols-2) {
+                // make border
+                if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
+                    nnextx = v0[x+1] + 2*v1[x+1] + 2*v2[x+1] + 2*v3[x+1] + v4[x+1];
+                } else if (border == BORDER_MODE_REFLECT101) {
+                    nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
+                } else if (border == BORDER_MODE_CONSTANT) {
+                    nnextx = 8 * borderValue;
+                }
+                nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
+            } else {
+                nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
+                nnextx = v0[x+2] + 2*v1[x+2] + 2*v2[x+2] + 2*v3[x+2] + v4[x+2];
+            }
+            s16 res = pprevx + prevx + currx + nextx + nnextx;
+            *(drow+x) = 2*res;
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)border;
+    (void)borderValue;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/magnitude.cpp
+++ b/3rdparty/carotene/src/magnitude.cpp
@ -0,0 +1,160 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+#include <cmath>
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+// Element-wise functor producing sqrt(src0^2 + src1^2) for s16 operands.
+// Squares are formed as 32-bit integers, summed and rooted in f32, converted
+// back to s32 and finally narrowed to s16 with saturation.
+struct Magnitude
+{
+    typedef s16 type;
+
+    // 8 lanes: run the low and the high halves through the 4-lane overload
+    // and glue the two results back together.
+    void operator() (const int16x8_t & v_src0, const int16x8_t & v_src1,
+              int16x8_t & v_dst) const
+    {
+        int16x4_t v_lo, v_hi;
+        (*this)(vget_low_s16(v_src0), vget_low_s16(v_src1), v_lo);
+        (*this)(vget_high_s16(v_src0), vget_high_s16(v_src1), v_hi);
+        v_dst = vcombine_s16(v_lo, v_hi);
+    }
+
+    // 4 lanes
+    void operator() (const int16x4_t & v_src0, const int16x4_t & v_src1,
+              int16x4_t & v_dst) const
+    {
+        const float32x4_t v_sq0 = vcvtq_f32_s32(vmull_s16(v_src0, v_src0));
+        const float32x4_t v_sq1 = vcvtq_f32_s32(vmull_s16(v_src1, v_src1));
+        const int32x4_t v_root = vcvtq_s32_f32(internal::vsqrtq_f32(vaddq_f32(v_sq0, v_sq1)));
+        v_dst = vqmovn_s32(v_root);
+    }
+
+    // single element
+    void operator() (const short * src0, const short * src1, short * dst) const
+    {
+        const f32 a = static_cast<f32>(*src0);
+        const f32 b = static_cast<f32>(*src1);
+        *dst = internal::saturate_cast<s16>((s32)sqrtf(a * a + b * b));
+    }
+};
+
+// Element-wise functor producing sqrt(src0^2 + src1^2) for f32 operands,
+// with overloads for 4 lanes, 2 lanes and a single element.
+struct MagnitudeF32
+{
+    typedef f32 type;
+
+    // 4 lanes
+    void operator() (const float32x4_t & v_src0, const float32x4_t & v_src1,
+              float32x4_t & v_dst) const
+    {
+        const float32x4_t v_sq0 = vmulq_f32(v_src0, v_src0);
+        const float32x4_t v_sq1 = vmulq_f32(v_src1, v_src1);
+        v_dst = internal::vsqrtq_f32(vaddq_f32(v_sq0, v_sq1));
+    }
+
+    // 2 lanes
+    void operator() (const float32x2_t & v_src0, const float32x2_t & v_src1,
+              float32x2_t & v_dst) const
+    {
+        const float32x2_t v_sq0 = vmul_f32(v_src0, v_src0);
+        const float32x2_t v_sq1 = vmul_f32(v_src1, v_src1);
+        v_dst = internal::vsqrt_f32(vadd_f32(v_sq0, v_sq1));
+    }
+
+    // single element
+    void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
+    {
+        const f32 a = *src0;
+        const f32 b = *src1;
+        *dst = sqrtf(a * a + b * b);
+    }
+};
+
+} // namespace
+
+#endif
+
+// Per-element magnitude of two s16 images written to an s16 image.
+// After internal::assertSupportedConfiguration(), NEON builds hand the work
+// to internal::vtransform with the Magnitude functor; in other builds the
+// arguments are only marked as unused.
+void magnitude(const Size2D &size,
+               const s16 * src0Base, ptrdiff_t src0Stride,
+               const s16 * src1Base, ptrdiff_t src1Stride,
+               s16 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifndef CAROTENE_NEON
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#else
+    internal::vtransform(size,
+                         src0Base, src0Stride, src1Base, src1Stride,
+                         dstBase, dstStride,
+                         Magnitude());
+#endif
+}
+
+// Per-element magnitude of two f32 images written to an f32 image.
+// After internal::assertSupportedConfiguration(), NEON builds hand the work
+// to internal::vtransform with the MagnitudeF32 functor; in other builds the
+// arguments are only marked as unused.
+void magnitude(const Size2D &size,
+               const f32 * src0Base, ptrdiff_t src0Stride,
+               const f32 * src1Base, ptrdiff_t src1Stride,
+               f32 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifndef CAROTENE_NEON
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#else
+    internal::vtransform(size,
+                         src0Base, src0Stride, src1Base, src1Stride,
+                         dstBase, dstStride,
+                         MagnitudeF32());
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/meanstddev.cpp
+++ b/3rdparty/carotene/src/meanstddev.cpp
@ -0,0 +1,163 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include <cmath>
+
+namespace CAROTENE_NS {
+
+// Mean and standard deviation of a u8 image.
+// The sum and the sum of squares are obtained from sqsum() as f64 values and
+// divided by size.total(); a negative variance estimate is clamped to zero
+// before the square root. Either output pointer may be null, in which case
+// that result is not written. Non-NEON builds leave the outputs untouched.
+void meanStdDev(const Size2D &size,
+                const u8 * srcBase, ptrdiff_t srcStride,
+                f32 * pMean, f32 * pStdDev)
+{
+    internal::assertSupportedConfiguration();
+#ifndef CAROTENE_NEON
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)pMean;
+    (void)pStdDev;
+#else
+    f64 total = 0.0;
+    f64 totalSq = 0.0;
+    sqsum(size, srcBase, srcStride, &total, &totalSq, 1);
+
+    // turn the accumulated sums into mean / standard deviation
+    const f64 invCount = 1.0 / size.total();
+    const f64 average = total * invCount;
+    const f64 variance = std::max(totalSq * invCount - average * average, 0.0);
+    const f64 deviation = sqrt(variance);
+
+    if (pMean)
+        *pMean = average;
+    if (pStdDev)
+        *pStdDev = deviation;
+#endif
+}
+
+// Mean and standard deviation of a u16 image.
+//
+// Each row is consumed in blocks of at most blockSize0 (1024) elements taken
+// from the part of the row whose length is a multiple of 4 (roiw4). Inside a
+// block the plain sum is accumulated in four u32 lanes and the sum of squares
+// in four f32 lanes; at the end of the block both are flushed into the f64
+// totals. With 1024 elements per block each u32 lane adds at most 256 values
+// of at most 65535, i.e. less than 2^24, so the u32 -> f32 conversion used for
+// the flush is exact. The up to 3 trailing elements of a row are added one by
+// one.
+//
+// Either output pointer may be null, in which case that result is not
+// written. Non-NEON builds leave the outputs untouched.
+void meanStdDev(const Size2D &size,
+                const u16 * srcBase, ptrdiff_t srcStride,
+                f32 * pMean, f32 * pStdDev)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3;
+    f64 fsum = 0.0, fsqsum = 0.0;
+
+    f32 arsum[8];
+    uint32x4_t v_zero = vdupq_n_u32(0u), v_sum;
+    float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
+        size_t j = 0u;
+
+        while (j < roiw4)
+        {
+            // blockSize is the end index (exclusive) of the current block
+            size_t blockSize = std::min(roiw4 - j, blockSize0) + j;
+            v_sum = v_zero;
+            v_sqsum = v_zero_f;
+
+            // 16 elements per iteration; "<=" so that a final group of exactly
+            // 16 elements is also handled here instead of by the 4-wide loop
+            // (the per-lane accumulation order is the same in both loops)
+            for ( ; j + 16 <= blockSize ; j += 16)
+            {
+                internal::prefetch(src + j);
+                uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);
+
+                // 0
+                uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0));
+                uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0));
+                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
+                float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo);
+                float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi);
+                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
+                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
+
+                // 1
+                v_srclo = vmovl_u16(vget_low_u16(v_src1));
+                v_srchi = vmovl_u16(vget_high_u16(v_src1));
+                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
+                v_srclo_f = vcvtq_f32_u32(v_srclo);
+                v_srchi_f = vcvtq_f32_u32(v_srchi);
+                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
+                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
+            }
+
+            // remaining groups of 4 elements inside the block
+            for ( ; j < blockSize; j += 4)
+            {
+                uint32x4_t v_src = vmovl_u16(vld1_u16(src + j));
+                float32x4_t v_src_f = vcvtq_f32_u32(v_src);
+                v_sum = vaddq_u32(v_sum, v_src);
+                v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f);
+            }
+
+            // flush the per-block lane accumulators into the f64 totals:
+            // arsum[0..3] holds the sums, arsum[4..7] the sums of squares
+            vst1q_f32(arsum, vcvtq_f32_u32(v_sum));
+            vst1q_f32(arsum + 4, v_sqsum);
+
+            fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3];
+            fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7];
+        }
+
+        // collect a few last elements in the current row
+        for ( ; j < size.width; ++j)
+        {
+            f32 srcval = src[j];
+            fsum += srcval;
+            fsqsum += srcval * srcval;
+        }
+    }
+
+    // calc mean and stddev; a negative variance estimate is clamped to zero
+    f64 itotal = 1.0 / size.total();
+    f64 mean = fsum * itotal;
+    f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));
+
+    if (pMean)
+        *pMean = mean;
+    if (pStdDev)
+        *pStdDev = stddev;
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)pMean;
+    (void)pStdDev;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/median_filter.cpp
+++ b/3rdparty/carotene/src/median_filter.cpp
@ -0,0 +1,227 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+/*
+ * The code here is based on the code in
+ * <http://ndevilla.free.fr/median/median/src/optmed.c>, which is in public domain.
+ * See also <http://ndevilla.free.fr/median/median/index.html>.
+ */
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+namespace {
+
+    // Builds the 16-byte vector that sits cn bytes to the left of r when the
+    // data before r is missing: lanes [cn, 16) receive r's lanes [0, 16 - cn)
+    // and lanes [0, cn) receive a copy of r's first cn bytes.
+    // The scratch buffer leaves room for cn up to 8.
+    uint8x16_t getLeftReplicate(uint8x16_t r, u32 cn)
+    {
+        u8 scratch[16+8];
+        u8 * const stored = scratch + cn;
+        vst1q_u8(stored, r);
+
+        // put a copy of the leading cn bytes in front of the stored vector
+        u32 c = 0;
+        while (c < cn)
+        {
+            scratch[c] = stored[c];
+            ++c;
+        }
+
+        return vld1q_u8(scratch);
+    }
+
+    // Builds the 8-byte vector that sits cn bytes to the right of r when the
+    // data after r is missing: lanes [0, 8 - cn) receive r's lanes [cn, 8)
+    // and the last cn lanes receive a copy of r's last cn bytes.
+    // The scratch buffer leaves room for cn up to 8.
+    uint8x8_t getRightReplicate(uint8x8_t r, u32 cn)
+    {
+        u8 scratch[8+8];
+        vst1_u8(scratch, r);
+
+        // append a copy of the trailing cn bytes behind the stored vector
+        const u8 * const lastPixel = scratch + 8 - cn;
+        u8 * const padding = scratch + 8;
+        u32 c = 0;
+        while (c < cn)
+        {
+            padding[c] = lastPixel[c];
+            ++c;
+        }
+
+        return vld1_u8(scratch + cn);
+    }
+
+} // namespace
+
+//o------^-------^-----------------------------o 0
+//       |       |
+//o--^---v---^---|-------^---------------------o 1
+//   |       |   |       |
+//o--v-------v---|-------|-^-------^-------^---o 2
+//               |       | |       |       |
+//o------^-------v-----^-|-|-------|-------|---o 3
+//       |             | | |       |       |
+//o--^---v---^-----^---|-v-|---^---v---^---v---o 4
+//   |       |     |   |   |   |       |
+//o--v-------v---^-|---|---v---|-------|-------o 5
+//               | |   |       |       |
+//o------^-------|-|---v-------|-------v-------o 6
+//       |       | |           |
+//o--^---v---^---|-v-----------v---------------o 7
+//   |       |   |
+//o--v-------v---v-----------------------------o 8
+
+// Helper macros for the 9-input network drawn above.
+//
+// ELT(num, level) pastes the identifier v<num>_lv<level>.
+//
+// PIX_MIN and PIX_MAX are not defined here: medianFilter3x3 defines them
+// (once for 16-lane and once for 8-lane vectors) right before it expands
+// SORT9 and undefines them afterwards. As defined there, each step declares a
+// new variable at level `newlvl` instead of overwriting its inputs: PIX_MIN
+// names the result after element `a`, PIX_MAX after element `b`.
+// PIX_SORT is simply a PIX_MIN followed by a PIX_MAX on the same operands.
+//
+// SORT9 reads v0_lv00 .. v8_lv00, which must be declared by the code that
+// expands it. Its last step declares v4_lv19, which is the value
+// medianFilter3x3 stores as the filter output.
+#define ELT(num, level) v ## num ## _lv ## level
+#define PIX_SORT(a, alvl, b, blvl, newlvl) \
+    PIX_MIN(a, alvl, b, blvl, newlvl); \
+    PIX_MAX(a, alvl, b, blvl, newlvl);
+
+#define SORT9 \
+    PIX_SORT(1, 00, 2, 00, 01); \
+    PIX_SORT(4, 00, 5, 00, 02); \
+    PIX_SORT(7, 00, 8, 00, 03); \
+    PIX_SORT(0, 00, 1, 01, 04); \
+    PIX_SORT(3, 00, 4, 02, 05); \
+    PIX_SORT(6, 00, 7, 03, 06); \
+    PIX_SORT(1, 04, 2, 01, 07); \
+    PIX_SORT(4, 05, 5, 02, 08); \
+    PIX_SORT(7, 06, 8, 03, 09); \
+    PIX_MAX (0, 04, 3, 05, 10); \
+    PIX_MIN (5, 08, 8, 09, 11); \
+    PIX_SORT(4, 08, 7, 09, 12); \
+    PIX_MAX (3, 10, 6, 06, 13); \
+    PIX_MAX (1, 07, 4, 12, 14); \
+    PIX_MIN (2, 07, 5, 11, 15); \
+    PIX_MIN (4, 14, 7, 12, 16); \
+    PIX_SORT(4, 16, 2, 15, 17); \
+    PIX_MAX (6, 13, 4, 17, 18); \
+    PIX_MIN (4, 18, 2, 17, 19);
+
+#endif
+
+// Tells whether medianFilter3x3 accepts the given image: the configuration
+// must be supported, there may be at most 8 channels, and the image must be
+// at least 16 + numChannels pixels wide.
+bool isMedianFilter3x3Supported(const Size2D &size, u32 numChannels)
+{
+    if (!isSupportedConfiguration())
+        return false;
+
+    if (numChannels > 8)
+        return false;
+
+    return size.width >= 16 + numChannels;
+}
+
+// 3x3 median filter for u8 images with numChannels interleaved channels.
+//
+// Every output byte is the result of the SORT9 network applied to the nine
+// bytes found at offsets -cn, 0, +cn in the previous, the current and the
+// next source row.
+//
+// Border handling is driven by srcMargin:
+//   - top / bottom == 0: the current row stands in for the missing row above
+//     the first / below the last row; otherwise the neighbouring row is read
+//     at psrc1 -/+ srcStride even for the first / last row.
+//   - left / right > 0: bytes outside the row are read directly; otherwise
+//     the first / last cn bytes of the row are replicated.
+//
+// The configuration is asserted through isMedianFilter3x3Supported(), i.e.
+// width >= 16 + numChannels and numChannels <= 8. Non-NEON builds do nothing.
+void medianFilter3x3(const Size2D &size, u32 numChannels,
+                     const u8 *srcBase, ptrdiff_t srcStride,
+                     const Margin &srcMargin,
+                     u8 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration(isMedianFilter3x3Supported(size, numChannels));
+#ifdef CAROTENE_NEON
+    u32 cn = numChannels;
+    // row length in bytes (all channels)
+    size_t colsn = size.width * cn;
+
+    for (size_t i = 0; i < size.height; ++i) {
+        // psrc0 / psrc1 / psrc2: previous, current and next source row
+        const u8* psrc1 = internal::getRowPtr(srcBase, srcStride, i);
+        const u8* psrc0 = i == 0 && srcMargin.top == 0 ? psrc1 : psrc1 - srcStride;
+        const u8* psrc2 = i + 1 == size.height && srcMargin.bottom == 0 ? psrc1 : psrc1 + srcStride;
+        u8* pdst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        // Head and main part of the row, 16 bytes at a time, left to right.
+        {
+            // Operands for the first 16 bytes (j == 0): the left neighbours
+            // (v0..v2) need border treatment, everything else is a plain load.
+            uint8x16_t v3_lv00 = vld1q_u8(psrc0);
+            uint8x16_t v4_lv00 = vld1q_u8(psrc1);
+            uint8x16_t v5_lv00 = vld1q_u8(psrc2);
+            uint8x16_t v6_lv00 = vld1q_u8(psrc0 + cn);
+            uint8x16_t v7_lv00 = vld1q_u8(psrc1 + cn);
+            uint8x16_t v8_lv00 = vld1q_u8(psrc2 + cn);
+            uint8x16_t v0_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc0 - cn) : getLeftReplicate(v3_lv00, cn);
+            uint8x16_t v1_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc1 - cn) : getLeftReplicate(v4_lv00, cn);
+            uint8x16_t v2_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc2 - cn) : getLeftReplicate(v5_lv00, cn);
+
+            // Enter the loop in the middle: the first pass reuses the operands
+            // prepared above and skips the loop condition and the in-loop loads.
+            goto medianBlur3x3_mainBody;
+
+            for (; j < colsn - 16; j += 16) {
+                internal::prefetch(psrc0 + j);
+                internal::prefetch(psrc1 + j);
+                internal::prefetch(psrc2 + j);
+
+                v0_lv00 = vld1q_u8(psrc0 + j - cn);
+                v1_lv00 = vld1q_u8(psrc1 + j - cn);
+                v2_lv00 = vld1q_u8(psrc2 + j - cn);
+                v3_lv00 = vld1q_u8(psrc0 + j);
+                v4_lv00 = vld1q_u8(psrc1 + j);
+                v5_lv00 = vld1q_u8(psrc2 + j);
+                v6_lv00 = vld1q_u8(psrc0 + j + cn);
+                v7_lv00 = vld1q_u8(psrc1 + j + cn);
+                v8_lv00 = vld1q_u8(psrc2 + j + cn);
+
+medianBlur3x3_mainBody:
+
+// 16-lane flavour of the network steps; each step declares a new variable
+#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x16_t ELT(a, newlvl) = vminq_u8(ELT(a, alvl), ELT(b, blvl))
+#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x16_t ELT(b, newlvl) = vmaxq_u8(ELT(a, alvl), ELT(b, blvl))
+                SORT9;
+#undef PIX_MAX
+#undef PIX_MIN
+
+                // v4_lv19 is the final output of SORT9
+                vst1q_u8(pdst + j, v4_lv19);
+            }
+        }
+
+        // Tail of the row, 8 bytes at a time, right to left, starting with the
+        // last 8 bytes and walking back while k >= j - 8, where j is the
+        // position at which the 16-byte loop above stopped.
+        {
+            // Operands for the last 8 bytes: the right neighbours (v6..v8)
+            // need border treatment, everything else is a plain load.
+            size_t k = colsn - 8;
+            uint8x8_t v0_lv00 = vld1_u8(psrc0 + k - cn);
+            uint8x8_t v1_lv00 = vld1_u8(psrc1 + k - cn);
+            uint8x8_t v2_lv00 = vld1_u8(psrc2 + k - cn);
+            uint8x8_t v3_lv00 = vld1_u8(psrc0 + k);
+            uint8x8_t v4_lv00 = vld1_u8(psrc1 + k);
+            uint8x8_t v5_lv00 = vld1_u8(psrc2 + k);
+            uint8x8_t v6_lv00 = srcMargin.right > 0 ? vld1_u8(psrc0 + k + cn) : getRightReplicate(v3_lv00, cn);
+            uint8x8_t v7_lv00 = srcMargin.right > 0 ? vld1_u8(psrc1 + k + cn) : getRightReplicate(v4_lv00, cn);
+            uint8x8_t v8_lv00 = srcMargin.right > 0 ? vld1_u8(psrc2 + k + cn) : getRightReplicate(v5_lv00, cn);
+
+            // Same trick as above: the first pass uses the prepared operands.
+            goto medianBlur3x3_tailBody;
+
+            for (; k >= j - 8; k -= 8) {
+                v0_lv00 = vld1_u8(psrc0 + k - cn);
+                v1_lv00 = vld1_u8(psrc1 + k - cn);
+                v2_lv00 = vld1_u8(psrc2 + k - cn);
+                v3_lv00 = vld1_u8(psrc0 + k);
+                v4_lv00 = vld1_u8(psrc1 + k);
+                v5_lv00 = vld1_u8(psrc2 + k);
+                v6_lv00 = vld1_u8(psrc0 + k + cn);
+                v7_lv00 = vld1_u8(psrc1 + k + cn);
+                v8_lv00 = vld1_u8(psrc2 + k + cn);
+
+medianBlur3x3_tailBody:
+
+// 8-lane flavour of the network steps; each step declares a new variable
+#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x8_t ELT(a, newlvl) = vmin_u8(ELT(a, alvl), ELT(b, blvl))
+#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x8_t ELT(b, newlvl) = vmax_u8(ELT(a, alvl), ELT(b, blvl))
+                SORT9;
+#undef PIX_MAX
+#undef PIX_MIN
+
+                // v4_lv19 is the final output of SORT9
+                vst1_u8(pdst + k, v4_lv19);
+            }
+        }
+    }
+#else
+    (void)size;
+    (void)numChannels;
+    (void)srcBase;
+    (void)srcStride;
+    (void)srcMargin;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/min_max.cpp
+++ b/3rdparty/carotene/src/min_max.cpp
@ -0,0 +1,139 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include <algorithm>
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+// Binary functor yielding the smaller of its two inputs.
+// Three overloads are offered: one element through pointers, and the
+// VecTraits<T>::vec64 / VecTraits<T>::vec128 register types.
+template <typename T>
+struct Min
+{
+    typedef T type;
+
+    // Single element: reads lhs[0] and rhs[0], writes out[0].
+    void operator() (const T * lhs, const T * rhs, T * out) const
+    {
+        *out = std::min(*lhs, *rhs);
+    }
+
+    // vec64 operands, reduced with internal::vmin.
+    void operator() (const typename internal::VecTraits<T>::vec64 & lhs,
+                     const typename internal::VecTraits<T>::vec64 & rhs,
+                     typename internal::VecTraits<T>::vec64 & out) const
+    {
+        out = internal::vmin(lhs, rhs);
+    }
+
+    // vec128 operands, reduced with internal::vminq.
+    void operator() (const typename internal::VecTraits<T>::vec128 & lhs,
+                     const typename internal::VecTraits<T>::vec128 & rhs,
+                     typename internal::VecTraits<T>::vec128 & out) const
+    {
+        out = internal::vminq(lhs, rhs);
+    }
+};
+
+// Binary functor yielding the larger of its two inputs.
+// Three overloads are offered: one element through pointers, and the
+// VecTraits<T>::vec64 / VecTraits<T>::vec128 register types.
+template <typename T>
+struct Max
+{
+    typedef T type;
+
+    // Single element: reads lhs[0] and rhs[0], writes out[0].
+    void operator() (const T * lhs, const T * rhs, T * out) const
+    {
+        *out = std::max(*lhs, *rhs);
+    }
+
+    // vec64 operands, reduced with internal::vmax.
+    void operator() (const typename internal::VecTraits<T>::vec64 & lhs,
+                     const typename internal::VecTraits<T>::vec64 & rhs,
+                     typename internal::VecTraits<T>::vec64 & out) const
+    {
+        out = internal::vmax(lhs, rhs);
+    }
+
+    // vec128 operands, reduced with internal::vmaxq.
+    void operator() (const typename internal::VecTraits<T>::vec128 & lhs,
+                     const typename internal::VecTraits<T>::vec128 & rhs,
+                     typename internal::VecTraits<T>::vec128 & out) const
+    {
+        out = internal::vmaxq(lhs, rhs);
+    }
+};
+
+} // namespace
+
+// NEON build: defines the function `fun` for element type `type`.
+// The generated function asserts that the configuration is supported and then
+// passes both source images and the destination image to internal::vtransform
+// together with a default-constructed `op<type>` functor.
+#define IMPL_OP(fun, op, type)                                         \
+void fun(const Size2D &size,                                           \
+         const type * src0Base, ptrdiff_t src0Stride,                  \
+         const type * src1Base, ptrdiff_t src1Stride,                  \
+         type * dstBase, ptrdiff_t dstStride)                          \
+{                                                                      \
+    internal::assertSupportedConfiguration();                          \
+    internal::vtransform(size,                                         \
+                         src0Base, src0Stride,                         \
+                         src1Base, src1Stride,                         \
+                         dstBase, dstStride, op<type>());              \
+}
+
+#else
+
+// Build without CAROTENE_NEON: defines `fun` with the same parameter types
+// (all unnamed, hence unused); the body only calls
+// internal::assertSupportedConfiguration(). The `op` argument is ignored.
+#define IMPL_OP(fun, op, type)                    \
+void fun(const Size2D &,                          \
+         const type *, ptrdiff_t,                 \
+         const type *, ptrdiff_t,                 \
+         type *, ptrdiff_t)                       \
+{                                                 \
+    internal::assertSupportedConfiguration();     \
+}
+
+#endif
+
+// Emits both min() and max() for one element type by expanding IMPL_OP twice.
+#define IMPL_MINMAX(type) IMPL_OP(min, Min, type) IMPL_OP(max, Max, type)
+
+// Instantiate the pair for every element type listed here.
+IMPL_MINMAX(u8)
+IMPL_MINMAX(s8)
+IMPL_MINMAX(u16)
+IMPL_MINMAX(s16)
+IMPL_MINMAX(u32)
+IMPL_MINMAX(s32)
+IMPL_MINMAX(f32)
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/minmaxloc.cpp
+++ b/3rdparty/carotene/src/minmaxloc.cpp
--- a/3rdparty/carotene/src/morph.cpp
+++ b/3rdparty/carotene/src/morph.cpp
@ -0,0 +1,728 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include <algorithm>
+#include <limits>
+#include <vector>
+#include <cstring>
+
+namespace CAROTENE_NS {
+
+// Reports whether erode3x3/dilate3x3 can handle the given image size and
+// border mode: the configuration must be supported, the image at least
+// 16 columns wide, and the border either constant or replicate.
+bool isMorph3x3Supported(const Size2D &size, BORDER_MODE border)
+{
+    if (!isSupportedConfiguration())
+        return false;
+
+    if (size.width < 16)
+        return false;
+
+    return border == BORDER_MODE_CONSTANT ||
+           border == BORDER_MODE_REPLICATE;
+}
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+// Erosion operator: combines values with "minimum" and carries the value to
+// use for pixels outside the image.
+struct ErodeVecOp
+{
+    ErodeVecOp() : borderValue(0) {}
+
+    // With a replicate border the stored value is forced to the largest u8,
+    // which never changes the result of a minimum; otherwise the caller's
+    // value is kept.
+    ErodeVecOp(BORDER_MODE border, u8 borderValue_) :
+        borderValue(border == BORDER_MODE_REPLICATE ? std::numeric_limits<u8>::max()
+                                                    : borderValue_)
+    {
+    }
+
+    // scalar minimum
+    inline u8 operator()(u8 lhs, u8 rhs) const
+    {
+        return std::min(lhs, rhs);
+    }
+
+    // lane-wise minimum of 8 bytes
+    inline uint8x8_t operator()(uint8x8_t lhs, uint8x8_t rhs) const
+    {
+        return vmin_u8(lhs, rhs);
+    }
+
+    // lane-wise minimum of 16 bytes
+    inline uint8x16_t operator()(uint8x16_t lhs, uint8x16_t rhs) const
+    {
+        return vminq_u8(lhs, rhs);
+    }
+
+    u8 borderValue;
+};
+
+// Dilation operator: combines values with "maximum" and carries the value to
+// use for pixels outside the image.
+struct DilateVecOp
+{
+    DilateVecOp() : borderValue(0) {}
+
+    // With a replicate border the stored value is forced to the smallest u8,
+    // which never changes the result of a maximum; otherwise the caller's
+    // value is kept.
+    DilateVecOp(BORDER_MODE border, u8 borderValue_) :
+        borderValue(border == BORDER_MODE_REPLICATE ? std::numeric_limits<u8>::min()
+                                                    : borderValue_)
+    {
+    }
+
+    // scalar maximum
+    inline u8 operator()(u8 lhs, u8 rhs) const
+    {
+        return std::max(lhs, rhs);
+    }
+
+    // lane-wise maximum of 8 bytes
+    inline uint8x8_t operator()(uint8x8_t lhs, uint8x8_t rhs) const
+    {
+        return vmax_u8(lhs, rhs);
+    }
+
+    // lane-wise maximum of 16 bytes
+    inline uint8x16_t operator()(uint8x16_t lhs, uint8x16_t rhs) const
+    {
+        return vmaxq_u8(lhs, rhs);
+    }
+
+    u8 borderValue;
+};
+
+// Applies a 3x3 morphological operation to a u8 image in which horizontally
+// adjacent elements are treated as neighbouring pixels.
+//   size               - image width/height
+//   srcBase, srcStride - source image and its row stride
+//   dstBase, dstStride - destination image and its row stride
+//   border             - BORDER_MODE_CONSTANT or BORDER_MODE_REPLICATE handling
+//   vop                - combining operation (uint8x16_t and u8 overloads are
+//                        used here) plus the borderValue for outside pixels
+// Every row is handled by a 16-column NEON loop followed by a scalar loop for
+// the columns the vector loop did not store.
+template <typename VecOp>
+void morph3x3(const Size2D &size,
+              const u8 * srcBase, ptrdiff_t srcStride,
+              u8 * dstBase, ptrdiff_t dstStride,
+              BORDER_MODE border, const VecOp & vop)
+{
+    u8 borderValue = vop.borderValue;
+    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
+
+    const uint8x16_t v_zero = vdupq_n_u8(0);
+    const uint8x16_t v_border = vdupq_n_u8(borderValue);
+
+    // tprev/tcurr/tnext hold the vertically combined values of three
+    // consecutive 16-column groups; t0..t2 are their shifted combinations
+    uint8x16_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
+    uint8x16_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
+
+    for (ptrdiff_t y = 0; y < height; ++y)
+    {
+        // rows above/below the image: NULL for a constant border (v_border /
+        // borderValue is substituted below), otherwise clamped to the nearest row
+        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
+        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
+        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
+        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
+
+        // scalar sliding window of vertically combined values (left, centre, right)
+        u8 prevx = 0, currx = 0, nextx = 0;
+        ptrdiff_t x = 0;
+        // the last two rows stop the vector loop 16 columns earlier
+        // NOTE(review): presumably to keep the 16-byte loads inside the image
+        // buffer - confirm
+        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 16);
+
+        // perform vertical convolution
+        for ( ; x <= bwidth; x += 16)
+        {
+            internal::prefetch(srow0 + x);
+            internal::prefetch(srow1 + x);
+            internal::prefetch(srow2 + x);
+
+            uint8x16_t x0 = !srow0 ? v_border : vld1q_u8(srow0 + x);
+            uint8x16_t x1 = vld1q_u8(srow1 + x);
+            uint8x16_t x2 = !srow2 ? v_border : vld1q_u8(srow2 + x);
+
+            // calculate values for plain CPU part below if needed
+            if (x + 16 >= bwidth)
+            {
+                // x3: column for currx, x4: its left neighbour for prevx
+                ptrdiff_t x3 = x == width ? width - 1 : x;
+                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
+
+                if (border == BORDER_MODE_CONSTANT && x4 < 0)
+                    prevx = borderValue;
+                else
+                    prevx = vop(srow1[x4],
+                                vop(srow2 ? srow2[x4] : borderValue,
+                                    srow0 ? srow0[x4] : borderValue));
+
+                currx = vop(srow2 ? srow2[x3] : borderValue, vop(srow1[x3], srow0 ? srow0[x3] : borderValue));
+            }
+
+            // make shift
+            if (x)
+            {
+                tprev = tcurr;
+                tcurr = tnext;
+            }
+
+            // and calculate next value
+            tnext = vop(vop(x0, x1), x2);
+
+            // make extrapolation for the first elements
+            if (!x)
+            {
+                // make border
+                if (border == BORDER_MODE_CONSTANT)
+                    tcurr = v_border;
+                else if (border == BORDER_MODE_REPLICATE)
+                    tcurr = vdupq_n_u8(vgetq_lane_u8(tnext, 0));
+
+                // nothing is stored on the first iteration; output lags one
+                // 16-column group behind the loads
+                continue;
+            }
+
+            // combine 3 "shifted" vectors
+            // t0: left neighbours (last lane of tprev followed by tcurr lanes 0..14)
+            t0 = vextq_u8(tprev, tcurr, 15);
+            t1 = tcurr;
+            // t2: right neighbours (tcurr lanes 1..15 followed by first lane of tnext)
+            t2 = vextq_u8(tcurr, tnext, 1);
+
+            // and combine them with vop
+            t0 = vop(t0, vop(t1, t2));
+
+            vst1q_u8(drow + x - 16, t0);
+        }
+
+        // step back to the first column the vector loop did not store
+        x -= 16;
+        if (x == width)
+            --x;
+
+        // scalar loop for the remaining columns
+        for ( ; x < width; ++x)
+        {
+            // make extrapolation for the last elements
+            if (x + 1 >= width)
+            {
+                if (border == BORDER_MODE_CONSTANT)
+                    nextx = borderValue;
+                else if (border == BORDER_MODE_REPLICATE)
+                    nextx = vop(srow2[x], vop(srow1[x], srow0[x]));
+            }
+            else
+                nextx = vop(vop(srow2 ? srow2[x + 1] : borderValue,
+                                srow0 ? srow0[x + 1] : borderValue),
+                            srow1[x + 1]);
+
+            drow[x] = vop(prevx, vop(currx, nextx));
+
+            // make shift
+            prevx = currx;
+            currx = nextx;
+        }
+    }
+}
+
+} // namespace
+
+#endif
+
+// 3x3 erosion of a u8 image.
+// Asserts that isMorph3x3Supported(size, border) holds; on NEON builds the
+// work is done by morph3x3 with an ErodeVecOp, otherwise nothing is computed.
+void erode3x3(const Size2D &size,
+              const u8 * srcBase, ptrdiff_t srcStride,
+              u8 * dstBase, ptrdiff_t dstStride,
+              BORDER_MODE border, u8 borderValue)
+{
+    internal::assertSupportedConfiguration(isMorph3x3Supported(size, border));
+#ifdef CAROTENE_NEON
+    const ErodeVecOp minOp(border, borderValue);
+    morph3x3(size, srcBase, srcStride, dstBase, dstStride, border, minOp);
+#else
+    // no NEON: only silence unused-parameter warnings
+    (void)borderValue;
+    (void)border;
+    (void)dstStride;
+    (void)dstBase;
+    (void)srcStride;
+    (void)srcBase;
+    (void)size;
+#endif
+}
+
+// 3x3 dilation of a u8 image.
+// Asserts that isMorph3x3Supported(size, border) holds; on NEON builds the
+// work is done by morph3x3 with a DilateVecOp, otherwise nothing is computed.
+void dilate3x3(const Size2D &size,
+               const u8 * srcBase, ptrdiff_t srcStride,
+               u8 * dstBase, ptrdiff_t dstStride,
+               BORDER_MODE border, u8 borderValue)
+{
+    internal::assertSupportedConfiguration(isMorph3x3Supported(size, border));
+#ifdef CAROTENE_NEON
+    const DilateVecOp maxOp(border, borderValue);
+    morph3x3(size, srcBase, srcStride, dstBase, dstStride, border, maxOp);
+#else
+    // no NEON: only silence unused-parameter warnings
+    (void)borderValue;
+    (void)border;
+    (void)dstStride;
+    (void)dstBase;
+    (void)srcStride;
+    (void)srcBase;
+    (void)size;
+#endif
+}
+
+#ifdef CAROTENE_NEON
+namespace {
+
+// Horizontal morphology pass over one row.
+// For every output element i in [0, width*cn) the VecUpdate reduction of
+// src[i], src[i + cn], ..., src[i + (ksize - 1)*cn] is stored in dst[i].
+//   src   - input row; elements up to index (width + ksize - 1)*cn - 1 are read
+//   dst   - output row, width*cn elements are written
+//   width - number of output pixels
+//   cn    - interleaved channels per pixel
+//   ksize - kernel width in pixels; 1 degenerates to a plain copy
+template<class VecUpdate>
+void MorphRow(const u8* src, u8* dst, size_t width, s32 cn, size_t ksize)
+{
+    size_t i, j, k;
+    // element counts covered by the 16-wide and 8-wide vector loops
+    size_t width16 = (width & -16) * cn;
+    size_t width8 = (width & -8) * cn;
+    // from here on width and ksize are measured in elements, not pixels
+    width *= cn;
+
+    if (ksize == 1)
+    {
+        for (i = 0; i < width; i++)
+            dst[i] = src[i];
+        return;
+    }
+
+    ksize = ksize*cn;
+    VecUpdate updateOp;
+    switch(cn)
+    {
+    case 1:
+        // single channel: kernel taps are adjacent elements
+        for (i = 0; i < width16; i += 16)
+        {
+            const u8* sptr = src + i;
+            uint8x16_t s = vld1q_u8(sptr);
+            internal::prefetch(sptr);
+
+            for( k = 1; k < ksize; ++k)
+                s = updateOp(s, vld1q_u8(sptr + k));
+
+            vst1q_u8(dst + i, s);
+        }
+
+        for (; i < width8; i += 8)
+        {
+            const u8* sptr = src + i;
+            uint8x8_t s = vld1_u8(sptr);
+            internal::prefetch(sptr);
+
+            for( k = 1; k < ksize; ++k)
+                s = updateOp(s, vld1_u8(sptr + k));
+
+            vst1_u8(dst + i, s);
+        }
+        break;
+    default:
+        // interleaved channels: kernel taps are cn elements apart
+        for (i = 0; i < width16; i += 16)
+        {
+            uint8x16_t s = vld1q_u8(src + i);
+            internal::prefetch(src + i);
+
+            for (k = cn; k < ksize; k += cn)
+                s = updateOp(s, vld1q_u8(src + i + k));
+
+            vst1q_u8(dst + i, s);
+        }
+
+        for (; i < width8; i += 8)
+        {
+            uint8x8_t s = vld1_u8(src + i);
+            internal::prefetch(src + i);
+
+            for (k = cn; k < ksize; k += cn)
+                s = updateOp(s, vld1_u8(src + i + k));
+
+            vst1_u8(dst + i, s);
+        }
+        break;
+    }
+
+    // Scalar tail, one channel at a time (src/dst advance by one element per
+    // channel). The pair loop bound is written as "i + pairStep <= width":
+    // width is unsigned, so an expression of the form "width - cn*2" wraps
+    // around for rows narrower than two pixels and would let the loop run far
+    // beyond both buffers.
+    const size_t pairStep = (size_t)cn * 2;
+    const size_t i0 = i;
+    for( k = 0; k < (size_t)cn; k++, src++, dst++ )
+    {
+        // two neighbouring outputs share the reduction over taps 1..ksize-1
+        for( i = i0; i + pairStep <= width; i += pairStep )
+        {
+            const u8* s = src + i;
+            u8 m = s[cn];
+            for( j = cn*2; j < ksize; j += cn )
+                m = updateOp(m, s[j]);
+            dst[i] = updateOp(m, s[0]);
+            // j equals ksize here: the extra tap of the second output
+            dst[i+cn] = updateOp(m, s[j]);
+        }
+
+        // at most one leftover pixel per channel
+        for( ; i < width; i += cn )
+        {
+            const u8* s = src + i;
+            u8 m = s[0];
+            for( j = cn; j < ksize; j += cn )
+                m = updateOp(m, s[j]);
+            dst[i] = m;
+        }
+    }
+}
+
+// Vertical morphology pass.
+// Produces `count` output rows; output row r is the VecUpdate reduction of
+// src[r], src[r + 1], ..., src[r + ksize - 1], taken element by element.
+//   src     - table of row pointers; entries up to index count + ksize - 2 are read
+//   dst     - first output row; consecutive rows are dststep elements apart
+//   count   - number of output rows
+//   width   - elements per row
+//   ksize   - kernel height in rows
+// When ksize > 1 rows are emitted in pairs so that the reduction over the
+// rows both outputs share (src[1]..src[ksize-1]) is computed only once.
+template<class VecUpdate>
+void MorphColumn(const u8** src, u8* dst, ptrdiff_t dststep, size_t count, size_t width, size_t ksize)
+{
+    size_t i, k;
+    // part of the row handled 32 elements at a time
+    size_t width32 = width & -32;
+    VecUpdate updateOp;
+
+    uint8x16_t x0,x1,s0,s1;
+    if (ksize == 3)
+    {
+        // pair loop with the row indices of a 3-row kernel written out
+        for (; count > 1; count -= 2, dst += dststep * 2, src += 2)
+        {
+            for (i = 0; i < width32; i += 32)
+            {
+                // shared part: rows 1 and 2
+                const u8* sptr = src[1] + i;
+                s0 = vld1q_u8(sptr);
+                s1 = vld1q_u8(sptr + 16);
+                internal::prefetch(sptr);
+
+                sptr = src[2] + i;
+                x0 = vld1q_u8(sptr);
+                x1 = vld1q_u8(sptr + 16);
+                internal::prefetch(sptr);
+
+                s0 = updateOp(s0, x0);
+                s1 = updateOp(s1, x1);
+
+                // first output row adds row 0
+                sptr = src[0] + i;
+                x0 = vld1q_u8(sptr);
+                x1 = vld1q_u8(sptr + 16);
+                internal::prefetch(sptr);
+
+                vst1q_u8(dst+i, updateOp(s0, x0));
+                vst1q_u8(dst+i+16, updateOp(s1, x1));
+
+                // second output row adds row 3
+                sptr = src[3] + i;
+                x0 = vld1q_u8(sptr);
+                x1 = vld1q_u8(sptr + 16);
+                internal::prefetch(sptr);
+                vst1q_u8(dst + dststep + i, updateOp(s0, x0));
+                vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
+
+            }
+            // scalar remainder of the row pair
+            for(; i < width; i++ )
+            {
+                u8 s = src[1][i];
+
+                for( k = 2; k < ksize; k++ )
+                    s = updateOp(s, src[k][i]);
+
+                dst[i] = updateOp(s, src[0][i]);
+                // k equals ksize after the loop above
+                dst[i+dststep] = updateOp(s, src[k][i]);
+            }
+        }
+    }
+    else if (ksize > 1)
+        // pair loop for any other kernel height above 1
+        for (; count > 1; count -= 2, dst += dststep*2, src += 2)
+        {
+            for (i = 0; i < width32; i += 32)
+            {
+                // shared part: rows 1 .. ksize-1
+                const u8* sptr = src[1] + i;
+                s0 = vld1q_u8(sptr);
+                s1 = vld1q_u8(sptr + 16);
+                internal::prefetch(sptr);
+                for (k = 2; k < ksize; k++)
+                {
+                    sptr = src[k] + i;
+                    x0 = vld1q_u8(sptr);
+                    x1 = vld1q_u8(sptr + 16);
+                    internal::prefetch(sptr);
+
+                    s0 = updateOp(s0, x0);
+                    s1 = updateOp(s1, x1);
+                }
+
+                // first output row adds row 0
+                sptr = src[0] + i;
+                x0 = vld1q_u8(sptr);
+                x1 = vld1q_u8(sptr + 16);
+                internal::prefetch(sptr);
+
+                vst1q_u8(dst+i, updateOp(s0, x0));
+                vst1q_u8(dst+i+16, updateOp(s1, x1));
+
+                // second output row adds row ksize (k == ksize here)
+                sptr = src[k] + i;
+                x0 = vld1q_u8(sptr);
+                x1 = vld1q_u8(sptr + 16);
+                internal::prefetch(sptr);
+                vst1q_u8(dst + dststep + i, updateOp(s0, x0));
+                vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
+            }
+            // scalar remainder of the row pair
+            for(; i < width; i++ )
+            {
+                u8 s = src[1][i];
+
+                for( k = 2; k < ksize; k++ )
+                    s = updateOp(s, src[k][i]);
+
+                dst[i] = updateOp(s, src[0][i]);
+                dst[i+dststep] = updateOp(s, src[k][i]);
+            }
+        }
+
+    // rows left over by the pair loops (all rows when ksize <= 1), one at a time
+    for (; count > 0; count--, dst += dststep, src++)
+    {
+        for (i = 0; i < width32; i += 32)
+        {
+            const u8* sptr = src[0] + i;
+            s0 = vld1q_u8(sptr);
+            s1 = vld1q_u8(sptr + 16);
+            internal::prefetch(sptr);
+
+            for (k = 1; k < ksize; k++)
+            {
+                sptr = src[k] + i;
+                x0 = vld1q_u8(sptr);
+                x1 = vld1q_u8(sptr + 16);
+                internal::prefetch(sptr);
+                s0 = updateOp(s0, x0);
+                s1 = updateOp(s1, x1);
+            }
+
+            vst1q_u8(dst + i, s0);
+            vst1q_u8(dst + i + 16, s1);
+        }
+        // scalar remainder of the row
+        for(; i < width; i++ )
+        {
+            u8 s = src[0][i];
+            for( k = 1; k < ksize; k++ )
+                s = updateOp(s, src[k][i]);
+            dst[i] = s;
+        }
+    }
+}
+
+// Separable morphology driver shared by erode() and dilate().
+// Each source row is copied into a padded scratch row, filtered horizontally
+// with MorphRow<Op> into a ring buffer, and groups of buffered rows are then
+// reduced vertically with MorphColumn<Op> straight into the destination.
+//   ssize, cn            - size of the processed region and channels per pixel
+//   srcBase/srcStride    - source region and its row stride
+//   dstBase/dstStride    - destination and its row stride
+//   ksize                - kernel width/height
+//   anchorX, anchorY     - anchor position inside the kernel
+//   rowBorderType        - border handling for the left/right columns
+//   columnBorderType     - border handling for the top/bottom rows
+//   borderValues         - cn values, read when either border type is constant
+//   borderMargin         - number of pixels available around the region on each side
+template <class Op>
+inline void morphology(const Size2D &ssize, u32 cn,
+                       const u8 * srcBase, ptrdiff_t srcStride,
+                       u8 * dstBase, ptrdiff_t dstStride,
+                       const Size2D &ksize,
+                       size_t anchorX, size_t anchorY,
+                       BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
+                       const u8 * borderValues, Margin borderMargin)
+{
+    //Temporary buffers common for all iterations
+    // scratch row: ssize.width pixels plus ksize.width - 1 pixels of horizontal padding
+    std::vector<u8> _srcRow(cn*(ssize.width + ksize.width - 1));
+    u8* srcRow = &_srcRow[0];
+
+    // number of row-filtered rows kept in the ring buffer / row pointer table
+    size_t bufRows = std::max<size_t>(ksize.height + 3, std::max<size_t>(anchorY, ksize.height-anchorY-1)*2+1);
+    std::vector<u8*> _rows(bufRows);
+    u8** rows = &_rows[0];
+
+    // adjust swidthcn so that the used part of buffers stays compact in memory
+    ptrdiff_t swidthcn = cn*((ssize.width + 15) & -16);// cn * (aligned ssize.width size)
+    // +16 leaves room for internal::alignPtr to move the start of the buffer
+    std::vector<u8> _ringBuf(swidthcn*bufRows+16);
+    u8 * ringBuf = internal::alignPtr(&_ringBuf[0], 16);
+
+    // borderTab: source offsets used to synthesize non-constant left/right borders
+    size_t borderLength = std::max<size_t>(ksize.width - 1, 1) * cn;
+    std::vector<ptrdiff_t> _borderTab(borderLength);
+    ptrdiff_t * borderTab = &_borderTab[0];
+
+    std::vector<u8> _constBorderValue;
+    std::vector<u8> _constBorderRow;
+    u8 * constBorderValue = NULL;
+    u8 * constBorderRow = NULL;
+    if( rowBorderType == BORDER_MODE_CONSTANT || columnBorderType == BORDER_MODE_CONSTANT )
+    {
+        // borderValues repeated pixel after pixel for borderLength elements
+        _constBorderValue.resize(borderLength);
+        constBorderValue = &_constBorderValue[0];
+        size_t i;
+        for(i = 0; i < cn; i++)
+            constBorderValue[i] = borderValues[i];
+        for(; i < borderLength; i++)
+            constBorderValue[i] = constBorderValue[i-cn];
+
+        if( columnBorderType == BORDER_MODE_CONSTANT )
+        {
+            // constBorderRow: horizontal filter result of a row made only of
+            // border values; it stands in for rows outside the image below
+            _constBorderRow.resize(cn*(ssize.width + ksize.width - 1 + 16));
+            constBorderRow = internal::alignPtr(&_constBorderRow[0], 16);
+            size_t N = (ssize.width + ksize.width - 1)*cn;
+            for( i = 0; i < N; i += borderLength )
+            {
+                size_t n = std::min( borderLength, N - i );
+                for(size_t j = 0; j < n; j++)
+                    srcRow[i+j] = constBorderValue[j];
+            }
+            MorphRow<Op>(srcRow, constBorderRow, ssize.width, cn, ksize.width);
+        }
+    }
+
+    // size of the region including the surrounding margin
+    Size2D wholeSize(ssize.width + borderMargin.left + borderMargin.right,
+                     ssize.height + borderMargin.top + borderMargin.bottom);
+
+    // dx1/dx2: left/right padding columns not covered by borderMargin,
+    // i.e. columns that have to be synthesized
+    ptrdiff_t dx1 = std::max<ptrdiff_t>(anchorX - (ptrdiff_t)borderMargin.left, 0);
+    ptrdiff_t dx2 = std::max<ptrdiff_t>((ptrdiff_t)ksize.width - anchorX - 1 - (ptrdiff_t)borderMargin.right, 0);
+    // recompute border tables
+    if( dx1 > 0 || dx2 > 0 )
+    {
+        if( rowBorderType == BORDER_MODE_CONSTANT )
+        {
+            // constant border: the padding of srcRow is written once here and
+            // left untouched by the per-row memcpy below
+            memcpy( srcRow, &constBorderValue[0], dx1*cn );
+            memcpy( srcRow + (ssize.width + ksize.width - 1 - dx2)*cn, &constBorderValue[0], dx2*cn );
+        }
+        else
+        {
+            ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX) - borderMargin.left;
+
+            ptrdiff_t wholeWidth = wholeSize.width;
+
+            ptrdiff_t i, j;
+            // left border entries occupy borderTab[0 .. dx1*cn)
+            for( i = 0; i < dx1; i++ )
+            {
+                ptrdiff_t p0 = (internal::borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*cn;
+                for( j = 0; j < (ptrdiff_t)cn; j++ )
+                    borderTab[i*cn + j] = p0 + j;
+            }
+
+            // right border entries follow at borderTab[dx1*cn .. (dx1+dx2)*cn)
+            for( i = 0; i < dx2; i++ )
+            {
+                ptrdiff_t p0 = (internal::borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*cn;
+                for( j = 0; j < (ptrdiff_t)cn; j++ )
+                    borderTab[(i + dx1)*cn + j] = p0 + j;
+            }
+        }
+    }
+
+    // startY/endY: range of rows to read, counted from the top of the margin
+    ptrdiff_t startY, startY0, endY, rowCount;
+    startY = startY0 = std::max<ptrdiff_t>(borderMargin.top - anchorY, 0);
+    endY = std::min<ptrdiff_t>(borderMargin.top + ssize.height + ksize.height - anchorY - 1, wholeSize.height);
+
+    const u8* src = srcBase + (startY - borderMargin.top)*srcStride;
+    u8* dst = dstBase;
+
+    ptrdiff_t width = ssize.width, kwidth = ksize.width;
+    ptrdiff_t kheight = ksize.height, ay = anchorY;
+    ptrdiff_t width1 = ssize.width + kwidth - 1;
+    ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX);
+    // borders are rebuilt per row only for non-constant row border types
+    bool makeBorder = (dx1 > 0 || dx2 > 0) && rowBorderType != BORDER_MODE_CONSTANT;
+    ptrdiff_t dy = 0, i = 0;
+
+    // move src left by the columns that are read from the margin
+    src -= xofs1*cn;
+    // source rows still to be fed into the ring buffer
+    ptrdiff_t count = endY - startY;
+
+    rowCount = 0;
+    // each pass: buffer more row-filtered source rows, then emit the output
+    // rows they allow; i holds the number of rows emitted by the pass
+    for(;; dst += dstStride*i, dy += i)
+    {
+        // dcount: how many source rows to filter in this pass
+        ptrdiff_t dcount = bufRows - ay - startY - rowCount + borderMargin.top;
+        dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
+        dcount = std::min(dcount, count);
+        count -= dcount;
+        for( ; dcount-- > 0; src += srcStride )
+        {
+            // ring buffer slot that receives this row
+            ptrdiff_t bi = (startY - startY0 + rowCount) % bufRows;
+            u8* brow = ringBuf + bi*swidthcn;
+
+            // buffer full: the oldest row is dropped, so the window start advances
+            if( (size_t)(++rowCount) > bufRows )
+            {
+                --rowCount;
+                ++startY;
+            }
+
+            // copy the directly available part of the row between the paddings
+            memcpy( srcRow + dx1*cn, src, (width1 - dx2 - dx1)*cn );
+
+            if( makeBorder )
+            {
+                    // fill left and right padding through the offset table
+                    for( i = 0; i < (ptrdiff_t)(dx1*cn); i++ )
+                        srcRow[i] = src[borderTab[i]];
+                    for( i = 0; i < (ptrdiff_t)(dx2*cn); i++ )
+                        srcRow[i + (width1 - dx2)*cn] = src[borderTab[i+dx1*cn]];
+            }
+
+            MorphRow<Op>(srcRow, brow, width, cn, ksize.width);
+        }
+
+        // collect pointers to the buffered rows needed for the next output rows
+        ptrdiff_t max_i = std::min<ptrdiff_t>(bufRows, ssize.height - dy + (kheight - 1));
+        for( i = 0; i < max_i; i++ )
+        {
+            ptrdiff_t srcY = internal::borderInterpolate(dy + i + borderMargin.top - ay,
+                                               wholeSize.height, columnBorderType);
+            if( srcY < 0 ) // can happen only with constant border type
+                rows[i] = constBorderRow;
+            else
+            {
+                // row not buffered yet: stop collecting
+                if( srcY >= startY + rowCount )
+                    break;
+                ptrdiff_t bi = (srcY - startY0) % bufRows;
+                rows[i] = ringBuf + bi*swidthcn;
+            }
+        }
+        // fewer than kheight rows collected: no further output row can be produced
+        if( i < kheight )
+            break;
+        // i collected rows yield i - (kheight - 1) output rows
+        i -= kheight - 1;
+        MorphColumn<Op>((const u8**)rows, dst, dstStride, i, ssize.width*cn, ksize.height);
+    }
+}
+
+} // namespace
+#endif // CAROTENE_NEON
+
+// Erosion with an arbitrary rectangular kernel.
+// Asserts a non-empty image and an anchor lying inside the kernel; on NEON
+// builds the work is delegated to morphology<ErodeVecOp>, otherwise nothing
+// is computed.
+void erode(const Size2D &ssize, u32 cn,
+           const u8 * srcBase, ptrdiff_t srcStride,
+           u8 * dstBase, ptrdiff_t dstStride,
+           const Size2D &ksize,
+           size_t anchorX, size_t anchorY,
+           BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
+           const u8 * borderValues, Margin borderMargin)
+{
+    const bool argsValid = ssize.width > 0 && ssize.height > 0 &&
+                           anchorX < ksize.width && anchorY < ksize.height;
+    internal::assertSupportedConfiguration(argsValid);
+#ifdef CAROTENE_NEON
+    morphology<ErodeVecOp>(ssize, cn,
+                           srcBase, srcStride,
+                           dstBase, dstStride,
+                           ksize, anchorX, anchorY,
+                           rowBorderType, columnBorderType,
+                           borderValues, borderMargin);
+#else
+    // no NEON: only silence unused-parameter warnings
+    (void)borderMargin;
+    (void)borderValues;
+    (void)columnBorderType;
+    (void)rowBorderType;
+    (void)dstStride;
+    (void)dstBase;
+    (void)srcStride;
+    (void)srcBase;
+    (void)cn;
+#endif
+}
+
+// Dilation with an arbitrary rectangular kernel.
+// Asserts a non-empty image and an anchor lying inside the kernel; on NEON
+// builds the work is delegated to morphology<DilateVecOp>, otherwise nothing
+// is computed.
+void dilate(const Size2D &ssize, u32 cn,
+            const u8 * srcBase, ptrdiff_t srcStride,
+            u8 * dstBase, ptrdiff_t dstStride,
+            const Size2D &ksize,
+            size_t anchorX, size_t anchorY,
+            BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
+            const u8 * borderValues, Margin borderMargin)
+{
+    const bool argsValid = ssize.width > 0 && ssize.height > 0 &&
+                           anchorX < ksize.width && anchorY < ksize.height;
+    internal::assertSupportedConfiguration(argsValid);
+#ifdef CAROTENE_NEON
+    morphology<DilateVecOp>(ssize, cn,
+                            srcBase, srcStride,
+                            dstBase, dstStride,
+                            ksize, anchorX, anchorY,
+                            rowBorderType, columnBorderType,
+                            borderValues, borderMargin);
+#else
+    // no NEON: only silence unused-parameter warnings
+    (void)borderMargin;
+    (void)borderValues;
+    (void)columnBorderType;
+    (void)rowBorderType;
+    (void)dstStride;
+    (void)dstBase;
+    (void)srcStride;
+    (void)srcBase;
+    (void)cn;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/mul.cpp
+++ b/3rdparty/carotene/src/mul.cpp
--- a/3rdparty/carotene/src/norm.cpp
+++ b/3rdparty/carotene/src/norm.cpp
--- a/3rdparty/carotene/src/opticalflow.cpp
+++ b/3rdparty/carotene/src/opticalflow.cpp
@ -0,0 +1,539 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "saturate_cast.hpp"
+#include <vector>
+#include <float.h> // For FLT_EPSILON
+
+namespace CAROTENE_NS {
+
+#define CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))
+
+/*
+ *        Pyramidal Lucas-Kanade Optical Flow level processing
+ *
+ * Processes ptCount points at one pyramid level: for every point a window of
+ * winSize pixels is sampled (bilinearly) from the previous image, and the
+ * matching position in the next image is refined iteratively. nextPts is
+ * updated in place.
+ *
+ *   size              image size; only used to reject windows whose integer
+ *                     origin lies outside [-winSize, size) in either axis
+ *   cn                number of interleaved channels per pixel
+ *   prevData/prevStride, nextData/nextStride
+ *                     8-bit images; strides are added to u8 pointers
+ *   prevDerivData/prevDerivStride
+ *                     s16 derivatives of the previous image, two values per
+ *                     channel sample (read as an interleaved x/y pair);
+ *                     the stride is given in bytes (divided by sizeof(s16))
+ *   prevPts, nextPts  interleaved (x, y) pairs, two f32 per point; prevPts is
+ *                     multiplied by 1/(1 << level) before use
+ *   status            optional; set to false at level 0 for points that cannot
+ *                     be processed. NOTE(review): it is also read before the
+ *                     error computation, so the caller presumably initialises it
+ *   err               optional; receives the normalised minimal eigenvalue when
+ *                     getMinEigenVals is set, otherwise (at level 0, for points
+ *                     whose status is non-zero) the mean absolute window difference
+ *   terminationCount  maximal number of refinement iterations per point
+ *   terminationEpsilon
+ *                     iterations stop once the squared update length is <= this
+ *   level, maxLevel   current level and the level at which the initial guess is
+ *                     created; on other levels the stored nextPts is doubled
+ *   useInitialFlow    at level == maxLevel, start from nextPts instead of prevPts
+ *   minEigThreshold   points whose normalised minimal eigenvalue is below this
+ *                     value are skipped
+ *
+ * NOTE(review): the window may start up to winSize pixels outside the image and
+ * the bilinear taps read one extra column/row, so the images presumably have to
+ * be padded by the caller - confirm against the call site.
+ */
+void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
+                       const u8 *prevData, ptrdiff_t prevStride,
+                       const s16 *prevDerivData, ptrdiff_t prevDerivStride,
+                       const u8 *nextData, ptrdiff_t nextStride,
+                       u32 ptCount,
+                       const f32 *prevPts, f32 *nextPts,
+                       u8 *status, f32 *err,
+                       const Size2D &winSize,
+                       u32 terminationCount, f64 terminationEpsilon,
+                       u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
+                       f32 minEigThreshold)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Offset from the window centre to its top-left corner.
+    f32 halfWinX = (winSize.width-1)*0.5f, halfWinY = (winSize.height-1)*0.5f;
+    // Each channel sample has two derivative values.
+    s32 cn2 = cn*2;
+
+    // Scratch storage shared by all points: the interpolated window of the
+    // previous image (total*cn values) followed by its interpolated
+    // derivatives (total*cn2 values).
+    std::vector<s16> _buf(winSize.total()*(cn + cn2));
+    s16* IWinBuf = &_buf[0];
+    s32  IWinBufStride = winSize.width*cn;
+    s16* derivIWinBuf = &_buf[winSize.total()*cn];
+    s32  derivIWinBufStride = winSize.width*cn2;
+
+    for( u32 ptidx = 0; ptidx < ptCount; ptidx++ )
+    {
+        // Scale factor applied to the input coordinates for this level.
+        f32 levscale = (1./(1 << level));
+        u32 ptref = ptidx << 1;
+        f32 prevPtX = prevPts[ptref+0]*levscale;
+        f32 prevPtY = prevPts[ptref+1]*levscale;
+        f32 nextPtX;
+        f32 nextPtY;
+        // Initial guess for the position in the next image.
+        if( level == maxLevel )
+        {
+            if( useInitialFlow )
+            {
+                nextPtX = nextPts[ptref+0]*levscale;
+                nextPtY = nextPts[ptref+1]*levscale;
+            }
+            else
+            {
+                nextPtX = prevPtX;
+                nextPtY = prevPtY;
+            }
+        }
+        else
+        {
+            // The stored value (presumably the result of the coarser level) is doubled.
+            nextPtX = nextPts[ptref+0]*2.f;
+            nextPtY = nextPts[ptref+1]*2.f;
+        }
+        nextPts[ptref+0] = nextPtX;
+        nextPts[ptref+1] = nextPtY;
+
+        s32 iprevPtX, iprevPtY;
+        s32 inextPtX, inextPtY;
+        // Move from the window centre to its top-left corner and split the
+        // position into an integer origin and a fractional part.
+        prevPtX -= halfWinX;
+        prevPtY -= halfWinY;
+        iprevPtX = floor(prevPtX);
+        iprevPtY = floor(prevPtY);
+
+        // Skip the point when the window origin is out of the accepted range;
+        // status/err are only written at level 0.
+        if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width ||
+            iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height )
+        {
+            if( level == 0 )
+            {
+                if( status )
+                    status[ptidx] = false;
+                if( err )
+                    err[ptidx] = 0;
+            }
+            continue;
+        }
+
+        // Fixed-point bilinear weights with W_BITS fractional bits; iw11 is
+        // derived so that the four weights sum to exactly 1 << W_BITS.
+        f32 a = prevPtX - iprevPtX;
+        f32 b = prevPtY - iprevPtY;
+        const s32 W_BITS = 14, W_BITS1 = 14;
+        const f32 FLT_SCALE = 1.f/(1 << 20);
+        s32 iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
+        s32 iw01 = round(a*(1.f - b)*(1 << W_BITS));
+        s32 iw10 = round((1.f - a)*b*(1 << W_BITS));
+        s32 iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
+
+        // Derivative row stride expressed in s16 elements.
+        s32 dstep = prevDerivStride/sizeof(s16);
+        // Scalar accumulators (tail elements) for the sums of derivative products.
+        f32 A11 = 0, A12 = 0, A22 = 0;
+
+        int16x4_t viw00 = vmov_n_s16((s16)iw00);
+        int16x4_t viw01 = vmov_n_s16((s16)iw01);
+        int16x4_t viw10 = vmov_n_s16((s16)iw10);
+        int16x4_t viw11 = vmov_n_s16((s16)iw11);
+
+        // Vector accumulators for the same sums.
+        float32x4_t vA11 = vmovq_n_f32(0);
+        float32x4_t vA12 = vmovq_n_f32(0);
+        float32x4_t vA22 = vmovq_n_f32(0);
+
+        // Number of channel samples in one window row.
+        s32 wwcn = winSize.width*cn;
+
+        // extract the patch from the first image, compute covariation matrix of derivatives
+        s32 x = 0;
+        for(s32 y = 0; y < (s32)winSize.height; y++ )
+        {
+            const u8* src = prevData + prevStride*(y + iprevPtY) + iprevPtX*cn;
+            const s16* dsrc = prevDerivData + dstep*(y + iprevPtY) + iprevPtX*cn2;
+
+            s16* Iptr = IWinBuf + y*IWinBufStride;
+            s16* dIptr = derivIWinBuf + y*derivIWinBufStride;
+
+            // x is 0 on the first row and keeps its end-of-row value afterwards;
+            // here it only contributes to the prefetch address.
+            internal::prefetch(src + x + prevStride * 2, 0);
+            // Interpolated intensities, 8 samples per iteration. The result is
+            // shifted by W_BITS1-5, i.e. it keeps 5 extra fractional bits.
+            for(x = 0; x <= wwcn - 8; x += 8)
+            {
+                uint8x8_t vsrc00 = vld1_u8(src + x);
+                uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
+                uint8x8_t vsrc01 = vld1_u8(src + x + cn);
+                uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);
+
+                int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vsrc00));
+                int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vsrc10));
+                int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vsrc01));
+                int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vsrc11));
+
+                // vsuml accumulates the low four lanes, vsumh the high four.
+                int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
+                int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);
+
+                vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
+                vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);
+
+                vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
+                vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);
+
+                vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
+                vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);
+
+                int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
+                int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);
+
+                vst1q_s16(Iptr + x, vcombine_s16(vsumnl, vsumnh));
+            }
+            // Same interpolation for one remaining group of 4 samples; the
+            // loads still fetch 8 bytes, of which only the low 4 are used.
+            for(; x <= wwcn - 4; x += 4)
+            {
+                uint8x8_t vsrc00 = vld1_u8(src + x);
+                uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
+                uint8x8_t vsrc01 = vld1_u8(src + x + cn);
+                uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);
+
+                int16x4_t vs00 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc00)));
+                int16x4_t vs10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc10)));
+                int16x4_t vs01 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc01)));
+                int16x4_t vs11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc11)));
+
+                int32x4_t vsuml1 = vmull_s16(vs00, viw00);
+                int32x4_t vsuml2 = vmull_s16(vs01, viw01);
+                vsuml1 = vmlal_s16(vsuml1, vs10, viw10);
+                vsuml2 = vmlal_s16(vsuml2, vs11, viw11);
+                int32x4_t vsuml = vaddq_s32(vsuml1, vsuml2);
+
+                int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
+
+                vst1_s16(Iptr + x, vsumnl);
+            }
+
+            internal::prefetch(dsrc + dstep * 2, 0);
+            // Interpolated derivatives, 4 samples (8 s16 values) per iteration.
+            // x restarts at 0 and ends at the same multiple of 4 as above, so
+            // the scalar tail below finishes both intensities and derivatives.
+            for(x = 0; x <= wwcn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
+            {
+                // This condition is never true for a non-negative __GNUC_MINOR__,
+                // so the inline-assembly variant is compiled out and the
+                // intrinsics in the #else branch are what actually runs.
+#if __GNUC_MINOR__ < 0
+                __asm__ (
+                    "vld2.16 {d0-d1}, [%[dsrc00]]                         \n\t"
+                    "vld2.16 {d2-d3}, [%[dsrc10]]                         \n\t"
+                    "vld2.16 {d4-d5}, [%[dsrc01]]                         \n\t"
+                    "vld2.16 {d6-d7}, [%[dsrc11]]                         \n\t"
+                    "vmull.s16 q4, d3, %P[viw10]                           \n\t"
+                    "vmull.s16 q5, d0, %P[viw00]                           \n\t"
+                    "vmlal.s16 q4, d7, %P[viw11]                           \n\t"
+                    "vmlal.s16 q5, d4, %P[viw01]                           \n\t"
+                    "vmlal.s16 q4, d1, %P[viw00]                           \n\t"
+                    "vmlal.s16 q5, d2, %P[viw10]                           \n\t"
+                    "vmlal.s16 q4, d5, %P[viw01]                           \n\t"
+                    "vmlal.s16 q5, d6, %P[viw11]                            \n\t"
+                    "vrshrn.s32 d13, q4, %[W_BITS1]                       \n\t"
+                    "vrshrn.s32 d12, q5, %[W_BITS1]                       \n\t"
+                    "vmull.s16 q3, d13, d13                               \n\t"
+                    "vmull.s16 q4, d12, d12                               \n\t"
+                    "vmull.s16 q5, d13, d12                               \n\t"
+                    "vcvt.f32.s32 q3, q3                                  \n\t"
+                    "vcvt.f32.s32 q4, q4                                  \n\t"
+                    "vcvt.f32.s32 q5, q5                                  \n\t"
+                    "vadd.f32 %q[vA22], q3                                \n\t"
+                    "vadd.f32 %q[vA11], q4                                \n\t"
+                    "vadd.f32 %q[vA12], q5                                \n\t"
+                    "vst2.16 {d12-d13}, [%[out]]                          \n\t"
+                    : [vA22] "=w" (vA22),
+                      [vA11] "=w" (vA11),
+                      [vA12] "=w" (vA12)
+                    : "0" (vA22),
+                      "1" (vA11),
+                      "2" (vA12),
+                      [out] "r" (dIptr),
+                      [dsrc00] "r" (dsrc),
+                      [dsrc10] "r" (dsrc + dstep),
+                      [dsrc01] "r" (dsrc + cn2),
+                      [dsrc11] "r" (dsrc + dstep + cn2),
+                      [viw00] "w" (viw00),
+                      [viw10] "w" (viw10),
+                      [viw01] "w" (viw01),
+                      [viw11] "w" (viw11),
+                      [W_BITS1] "I" (W_BITS1)
+                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
+                );
+#else
+                // De-interleaving loads: val[0] holds the first value of each
+                // pair (x derivative), val[1] the second (y derivative).
+                int16x4x2_t vdsrc00 = vld2_s16(dsrc);
+                int16x4x2_t vdsrc10 = vld2_s16(dsrc + dstep);
+                int16x4x2_t vdsrc01 = vld2_s16(dsrc + cn2);
+                int16x4x2_t vdsrc11 = vld2_s16(dsrc + dstep + cn2);
+
+                int32x4_t vsumy = vmull_s16(vdsrc10.val[1], viw10);
+                int32x4_t vsumx = vmull_s16(vdsrc00.val[0], viw00);
+
+                vsumy = vmlal_s16(vsumy, vdsrc11.val[1], viw11);
+                vsumx = vmlal_s16(vsumx, vdsrc01.val[0], viw01);
+
+                vsumy = vmlal_s16(vsumy, vdsrc00.val[1], viw00);
+                vsumx = vmlal_s16(vsumx, vdsrc10.val[0], viw10);
+
+                vsumy = vmlal_s16(vsumy, vdsrc01.val[1], viw01);
+                vsumx = vmlal_s16(vsumx, vdsrc11.val[0], viw11);
+
+                int16x4_t vsumny = vrshrn_n_s32(vsumy, W_BITS1);
+                int16x4_t vsumnx = vrshrn_n_s32(vsumx, W_BITS1);
+
+                // Accumulate the derivative products in float.
+                int32x4_t va22i = vmull_s16(vsumny, vsumny);
+                int32x4_t va11i = vmull_s16(vsumnx, vsumnx);
+                int32x4_t va12i = vmull_s16(vsumnx, vsumny);
+
+                float32x4_t va22f = vcvtq_f32_s32(va22i);
+                float32x4_t va11f = vcvtq_f32_s32(va11i);
+                float32x4_t va12f = vcvtq_f32_s32(va12i);
+
+                vA22 = vaddq_f32(vA22, va22f);
+                vA11 = vaddq_f32(vA11, va11f);
+                vA12 = vaddq_f32(vA12, va12f);
+
+                // Store the interpolated derivatives re-interleaved as (x, y) pairs.
+                int16x4x2_t vsum;
+                vsum.val[0] = vsumnx;
+                vsum.val[1] = vsumny;
+                vst2_s16(dIptr, vsum);
+#endif
+            }
+
+            // Scalar tail: remaining samples of the row, intensity and derivatives together.
+            for( ; x < wwcn; x++, dsrc += 2, dIptr += 2 )
+            {
+                s32 ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
+                                      src[x+prevStride]*iw10 + src[x+prevStride+cn]*iw11, W_BITS1-5);
+                s32 ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
+                                       dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
+                s32 iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
+                                       dsrc[dstep+cn2+1]*iw11, W_BITS1);
+                Iptr[x] = (s16)ival;
+                dIptr[0] = (s16)ixval;
+                dIptr[1] = (s16)iyval;
+
+                A11 += (f32)(ixval*ixval);
+                A12 += (f32)(ixval*iyval);
+                A22 += (f32)(iyval*iyval);
+            }
+        }
+
+        // Fold the four vector lanes into the scalar accumulators.
+        f32 A11buf[2], A12buf[2], A22buf[2];
+        vst1_f32(A11buf, vadd_f32(vget_low_f32(vA11), vget_high_f32(vA11)));
+        vst1_f32(A12buf, vadd_f32(vget_low_f32(vA12), vget_high_f32(vA12)));
+        vst1_f32(A22buf, vadd_f32(vget_low_f32(vA22), vget_high_f32(vA22)));
+        A11 += A11buf[0] + A11buf[1];
+        A12 += A12buf[0] + A12buf[1];
+        A22 += A22buf[0] + A22buf[1];
+
+        A11 *= FLT_SCALE;
+        A12 *= FLT_SCALE;
+        A22 *= FLT_SCALE;
+
+        // D is the determinant of [[A11, A12], [A12, A22]]; minEig is the
+        // smaller eigenvalue of that matrix divided by the window area.
+        f32 D = A11*A22 - A12*A12;
+        f32 minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
+                        4.f*A12*A12))/(2*winSize.width*winSize.height);
+
+        if( err && getMinEigenVals )
+            err[ptidx] = (f32)minEig;
+
+        // Reject points whose matrix is too weak or (nearly) singular.
+        if( minEig < minEigThreshold || D < FLT_EPSILON )
+        {
+            if( level == 0 && status )
+                status[ptidx] = false;
+            continue;
+        }
+
+        // From here on D holds the reciprocal of the determinant.
+        D = 1.f/D;
+
+        nextPtX -= halfWinX;
+        nextPtY -= halfWinY;
+        f32 prevDeltaX = 0;
+        f32 prevDeltaY = 0;
+
+        // Iterative refinement of the window position in the next image.
+        for(u32 j = 0; j < terminationCount; j++ )
+        {
+            inextPtX = floor(nextPtX);
+            inextPtY = floor(nextPtY);
+
+            if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width ||
+               inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height )
+            {
+                if( level == 0 && status )
+                    status[ptidx] = false;
+                break;
+            }
+
+            // Bilinear weights for the current sub-pixel position.
+            a = nextPtX - inextPtX;
+            b = nextPtY - inextPtY;
+            iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
+            iw01 = round(a*(1.f - b)*(1 << W_BITS));
+            iw10 = round((1.f - a)*b*(1 << W_BITS));
+            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
+            // b1/b2: sums of (intensity difference * x/y derivative) over the window.
+            f32 b1 = 0, b2 = 0;
+
+            viw00 = vmov_n_s16((s16)iw00);
+            viw01 = vmov_n_s16((s16)iw01);
+            viw10 = vmov_n_s16((s16)iw10);
+            viw11 = vmov_n_s16((s16)iw11);
+
+            float32x4_t vb1 = vmovq_n_f32(0);
+            float32x4_t vb2 = vmovq_n_f32(0);
+
+            for(s32 y = 0; y < (s32)winSize.height; y++ )
+            {
+                const u8* Jptr = nextData + nextStride*(y + inextPtY) + inextPtX*cn;
+                const s16* Iptr = IWinBuf + y*IWinBufStride;
+                const s16* dIptr = derivIWinBuf + y*derivIWinBufStride;
+
+                x = 0;
+
+                internal::prefetch(Jptr, nextStride * 2);
+                internal::prefetch(Iptr, IWinBufStride/2);
+                internal::prefetch(dIptr, derivIWinBufStride/2);
+
+                // 8 samples per iteration: interpolate the next image, subtract
+                // the stored previous-image window and multiply by the derivatives.
+                for( ; x <= wwcn - 8; x += 8, dIptr += 8*2 )
+                {
+                    uint8x8_t vj00 = vld1_u8(Jptr + x);
+                    uint8x8_t vj10 = vld1_u8(Jptr + x + nextStride);
+                    uint8x8_t vj01 = vld1_u8(Jptr + x + cn);
+                    uint8x8_t vj11 = vld1_u8(Jptr + x + nextStride + cn);
+                    int16x8_t vI = vld1q_s16(Iptr + x);
+                    int16x8x2_t vDerivI = vld2q_s16(dIptr);
+
+                    int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vj00));
+                    int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vj10));
+                    int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vj01));
+                    int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vj11));
+
+                    int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
+                    int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);
+
+                    vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
+                    vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);
+
+                    vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
+                    vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);
+
+                    vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
+                    vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);
+
+                    int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
+                    int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);
+
+                    // Saturating subtraction of the stored window values.
+                    int16x8_t diff = vqsubq_s16(vcombine_s16(vsumnl, vsumnh), vI);
+
+                    // Each of vb1i/vb2i sums the low-half and high-half products.
+                    int32x4_t vb1l = vmull_s16(vget_low_s16(diff), vget_low_s16(vDerivI.val[0]));
+                    int32x4_t vb2h = vmull_s16(vget_high_s16(diff), vget_high_s16(vDerivI.val[1]));
+                    int32x4_t vb1i = vmlal_s16(vb1l, vget_high_s16(diff), vget_high_s16(vDerivI.val[0]));
+                    int32x4_t vb2i = vmlal_s16(vb2h, vget_low_s16(diff), vget_low_s16(vDerivI.val[1]));
+
+                    float32x4_t vb1f = vcvtq_f32_s32(vb1i);
+                    float32x4_t vb2f = vcvtq_f32_s32(vb2i);
+
+                    vb1 = vaddq_f32(vb1, vb1f);
+                    vb2 = vaddq_f32(vb2, vb2f);
+                }
+
+                // Scalar tail for the remaining samples of the row.
+                for( ; x < wwcn; x++, dIptr += 2 )
+                {
+                    s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
+                                          Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
+                                          W_BITS1-5) - Iptr[x];
+                    b1 += (f32)(diff*dIptr[0]);
+                    b2 += (f32)(diff*dIptr[1]);
+                }
+            }
+
+            // Horizontal reduction: bbuf[0] is the lane sum of vb1, bbuf[1] of vb2.
+            f32 bbuf[2];
+            float32x2_t vb = vpadd_f32(vadd_f32(vget_low_f32(vb1), vget_high_f32(vb1)), vadd_f32(vget_low_f32(vb2), vget_high_f32(vb2)));
+            vst1_f32(bbuf, vb);
+            b1 += bbuf[0];
+            b2 += bbuf[1];
+
+            b1 *= FLT_SCALE;
+            b2 *= FLT_SCALE;
+
+            // Position update from the 2x2 system (D is 1/determinant here).
+            f32 deltaX = (f32)((A12*b2 - A22*b1) * D);
+            f32 deltaY = (f32)((A12*b1 - A11*b2) * D);
+
+            nextPtX += deltaX;
+            nextPtY += deltaY;
+            // Stored positions refer to the window centre again.
+            nextPts[ptref+0] = nextPtX + halfWinX;
+            nextPts[ptref+1] = nextPtY + halfWinY;
+
+            if( ((double)deltaX*deltaX + (double)deltaY*deltaY) <= terminationEpsilon )
+                break;
+
+            // When two consecutive updates nearly cancel each other, step back
+            // by half of the last update and stop iterating.
+            if( j > 0 && std::abs(deltaX + prevDeltaX) < 0.01 &&
+               std::abs(deltaY + prevDeltaY) < 0.01 )
+            {
+                nextPts[ptref+0] -= deltaX*0.5f;
+                nextPts[ptref+1] -= deltaY*0.5f;
+                break;
+            }
+            prevDeltaX = deltaX;
+            prevDeltaY = deltaY;
+        }
+
+        // Error measure for the final position; only at level 0, only for points
+        // whose status is still non-zero and only when err does not already
+        // carry the minimal eigenvalue.
+        if( status && status[ptidx] && err && level == 0 && !getMinEigenVals )
+        {
+            f32 nextPointX = nextPts[ptref+0] - halfWinX;
+            f32 nextPointY = nextPts[ptref+1] - halfWinY;
+
+            s32 inextPointX = floor(nextPointX);
+            s32 inextPointY = floor(nextPointY);
+
+            if( inextPointX < -(s32)winSize.width || inextPointX >= (s32)size.width ||
+                inextPointY < -(s32)winSize.height || inextPointY >= (s32)size.height )
+            {
+                // status is known to be non-null here (checked by the enclosing if).
+                if( status )
+                    status[ptidx] = false;
+                continue;
+            }
+
+            f32 aa = nextPointX - inextPointX;
+            f32 bb = nextPointY - inextPointY;
+            iw00 = round((1.f - aa)*(1.f - bb)*(1 << W_BITS));
+            iw01 = round(aa*(1.f - bb)*(1 << W_BITS));
+            iw10 = round((1.f - aa)*bb*(1 << W_BITS));
+            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
+            f32 errval = 0.f;
+
+            for(s32 y = 0; y < (s32)winSize.height; y++ )
+            {
+                const u8* Jptr = nextData + nextStride*(y + inextPointY) + inextPointX*cn;
+                const s16* Iptr = IWinBuf + y*IWinBufStride;
+
+                for( x = 0; x < wwcn; x++ )
+                {
+                    s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
+                                          Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
+                                          W_BITS1-5) - Iptr[x];
+                    errval += std::abs((f32)diff);
+                }
+            }
+            // Mean absolute difference; the factor 32 removes the 5 extra
+            // fractional bits kept by the W_BITS1-5 shift.
+            err[ptidx] = errval / (32*wwcn*winSize.height);
+        }
+    }
+#else
+    // No NEON: nothing is computed, the casts only silence unused-parameter warnings.
+    (void)size;
+    (void)cn;
+    (void)prevData;
+    (void)prevStride;
+    (void)prevDerivData;
+    (void)prevDerivStride;
+    (void)nextData;
+    (void)nextStride;
+    (void)prevPts;
+    (void)nextPts;
+    (void)status;
+    (void)err;
+    (void)winSize;
+    (void)terminationCount;
+    (void)terminationEpsilon;
+    (void)level;
+    (void)maxLevel;
+    (void)useInitialFlow;
+    (void)getMinEigenVals;
+    (void)minEigThreshold;
+    (void)ptCount;
+#endif
+}
+
+}//CAROTENE_NS
+
--- a/3rdparty/carotene/src/phase.cpp
+++ b/3rdparty/carotene/src/phase.cpp
@ -0,0 +1,274 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include <cfloat>
+#include <cmath>
+
+#include "common.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+// Declares, in the scope where it is expanded, every constant that
+// FASTATAN2SCALAR and FASTATAN2VECTOR refer to by name:
+//   P1, P3, P5, P7      polynomial coefficients, each multiplied by
+//                       (180 / M_PI) and by scale;
+//   A_90, A_180, A_360  90, 180 and 360 multiplied by scale;
+//   eps, _90, _180, _360, z, p1, p3, p5, p7
+//                       float32x4_t vectors holding (float)DBL_EPSILON, the
+//                       A_* values, zero and the P* coefficients in all lanes.
+// NOTE: scale is pasted into the expressions without parentheses, so it should
+// be a single value or a product/quotient expression (no top-level + or -).
+#define FASTATAN2CONST(scale) \
+        f32 P1((f32)( 0.9997878412794807  * (180.0 / M_PI) * scale)), \
+        P3((f32)(-0.3258083974640975  * (180.0 / M_PI) * scale)), \
+        P5((f32)( 0.1555786518463281  * (180.0 / M_PI) * scale)), \
+        P7((f32)(-0.04432655554792128 * (180.0 / M_PI) * scale)), \
+         A_90((f32)(90.f * scale)), \
+        A_180((f32)(180.f * scale)), \
+        A_360((f32)(360.f * scale)); \
+        float32x4_t eps(vdupq_n_f32((float)DBL_EPSILON)), \
+         _90(vdupq_n_f32(A_90)), \
+        _180(vdupq_n_f32(A_180)), \
+        _360(vdupq_n_f32(A_360)), \
+           z(vdupq_n_f32(0.0f)), \
+        p1(vdupq_n_f32(P1)), \
+        p3(vdupq_n_f32(P3)), \
+        p5(vdupq_n_f32(P5)), \
+        p7(vdupq_n_f32(P7));
+
+// Scalar angle approximation: assigns to a the angle of the vector (x, y),
+// expressed in the units fixed by FASTATAN2CONST (which must have been
+// expanded in the same scope, as P1..P7 and A_90/A_180/A_360 are used here).
+// The ratio c = min(|x|, |y|) / (max(|x|, |y|) + DBL_EPSILON) is fed to the
+// odd polynomial P1*c + P3*c^3 + P5*c^5 + P7*c^7; the result is then reflected
+// around A_90 when |y| > |x|, around A_180 when x < 0 and around A_360 when
+// y < 0. For x == 0 and y == 0 the result is 0.
+// NOTE: x and y are evaluated more than once, so pass side-effect-free
+// expressions; a must be an assignable f32.
+#define FASTATAN2SCALAR(y, x, a) \
+    { \
+        f32 ax = std::abs(x), ay = std::abs(y); \
+        f32 c, c2; \
+        if (ax >= ay) \
+        { \
+            c = ay / (ax + (float)DBL_EPSILON); \
+            c2 = c * c; \
+            a = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \
+        } \
+        else \
+        { \
+            c = ax / (ay + (float)DBL_EPSILON); \
+            c2 = c * c; \
+            a = A_90 - (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \
+        } \
+        if (x < 0) \
+            a = A_180 - a; \
+        if (y < 0) \
+            a = A_360 - a; \
+    }
+
+// Four-lane counterpart of FASTATAN2SCALAR: assigns to a (a float32x4_t) the
+// per-lane angle of (v_x, v_y). Requires FASTATAN2CONST in the same scope
+// (uses eps, p1..p7, _90, _180, _360 and z).
+// The division is replaced by a multiplication with internal::vrecpq_f32 of
+// (max + eps), and the three conditional reflections are done branch-free
+// with compare masks and vbslq_f32 selects.
+// NOTE: v_x and v_y are evaluated more than once and must not be named like
+// the temporaries declared inside the macro (ax, ay, tmin, tmax, c, c2).
+#define FASTATAN2VECTOR(v_y, v_x, a) \
+    { \
+        float32x4_t ax = vabsq_f32(v_x), ay = vabsq_f32(v_y); \
+        float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); \
+        float32x4_t c = vmulq_f32(tmin, internal::vrecpq_f32(vaddq_f32(tmax, eps))); \
+        float32x4_t c2 = vmulq_f32(c, c); \
+        a = vmulq_f32(c2, p7); \
+ \
+        a = vmulq_f32(vaddq_f32(a, p5), c2); \
+        a = vmulq_f32(vaddq_f32(a, p3), c2); \
+        a = vmulq_f32(vaddq_f32(a, p1), c); \
+ \
+        a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); \
+        a = vbslq_f32(vcltq_f32(v_x, z), vsubq_f32(_180, a), a); \
+        a = vbslq_f32(vcltq_f32(v_y, z), vsubq_f32(_360, a), a); \
+ \
+    }
+
+} // namespace
+
+#endif
+
+/*
+ * Per-element angle of the vectors (src0, src1) for s16 inputs, written as u8.
+ * The angle is scaled by 256/360, rounded by adding 0.5, and narrowed to
+ * 8 bits by keeping the low byte. Without CAROTENE_NEON nothing is computed.
+ */
+void phase(const Size2D &size,
+           const s16 * src0Base, ptrdiff_t src0Stride,
+           const s16 * src1Base, ptrdiff_t src1Stride,
+           u8 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    FASTATAN2CONST(256.0f / 360.0f)
+
+    // Exclusive upper bounds for the start index of a 16- / 8-element step.
+    const size_t limit16 = size.width >= 15 ? size.width - 15 : 0;
+    const size_t limit8 = size.width >= 7 ? size.width - 7 : 0;
+
+    // Added before the float -> u32 conversion.
+    const float32x4_t vHalf = vdupq_n_f32(0.5f);
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const s16 * xRow = internal::getRowPtr(src0Base, src0Stride, row);
+        const s16 * yRow = internal::getRowPtr(src1Base, src1Stride, row);
+        u8 * outRow = internal::getRowPtr(dstBase, dstStride, row);
+        size_t col = 0;
+
+        // 16 elements per iteration, processed as four groups of four lanes.
+        for (; col < limit16; col += 16)
+        {
+            internal::prefetch(xRow + col);
+            internal::prefetch(yRow + col);
+
+            const int16x8_t vxFirst = vld1q_s16(xRow + col);
+            const int16x8_t vxSecond = vld1q_s16(xRow + col + 8);
+            const int16x8_t vyFirst = vld1q_s16(yRow + col);
+            const int16x8_t vySecond = vld1q_s16(yRow + col + 8);
+
+            float32x4_t vxf, vyf;
+            float32x4_t vAngleLo, vAngleHi;
+
+            // elements 0..7
+            vxf = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vxFirst)));
+            vyf = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vyFirst)));
+            FASTATAN2VECTOR(vyf, vxf, vAngleLo)
+
+            vxf = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxFirst)));
+            vyf = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vyFirst)));
+            FASTATAN2VECTOR(vyf, vxf, vAngleHi)
+
+            const uint16x4_t vq0 = vmovn_u32(vcvtq_u32_f32(vaddq_f32(vAngleLo, vHalf)));
+            const uint16x4_t vq1 = vmovn_u32(vcvtq_u32_f32(vaddq_f32(vAngleHi, vHalf)));
+            const uint16x8_t vFirst16 = vcombine_u16(vq0, vq1);
+
+            // elements 8..15
+            vxf = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vxSecond)));
+            vyf = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vySecond)));
+            FASTATAN2VECTOR(vyf, vxf, vAngleLo)
+
+            vxf = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxSecond)));
+            vyf = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vySecond)));
+            FASTATAN2VECTOR(vyf, vxf, vAngleHi)
+
+            const uint16x4_t vq2 = vmovn_u32(vcvtq_u32_f32(vaddq_f32(vAngleLo, vHalf)));
+            const uint16x4_t vq3 = vmovn_u32(vcvtq_u32_f32(vaddq_f32(vAngleHi, vHalf)));
+            const uint16x8_t vSecond16 = vcombine_u16(vq2, vq3);
+
+            vst1q_u8(outRow + col, vcombine_u8(vmovn_u16(vFirst16), vmovn_u16(vSecond16)));
+        }
+
+        // 8 elements per iteration.
+        for (; col < limit8; col += 8)
+        {
+            const int16x8_t vxs = vld1q_s16(xRow + col);
+            const int16x8_t vys = vld1q_s16(yRow + col);
+
+            float32x4_t vxf = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vxs)));
+            float32x4_t vyf = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vys)));
+            float32x4_t vAngleLo;
+            FASTATAN2VECTOR(vyf, vxf, vAngleLo)
+
+            vxf = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxs)));
+            vyf = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vys)));
+            float32x4_t vAngleHi;
+            FASTATAN2VECTOR(vyf, vxf, vAngleHi)
+
+            const uint16x4_t vq0 = vmovn_u32(vcvtq_u32_f32(vaddq_f32(vAngleLo, vHalf)));
+            const uint16x4_t vq1 = vmovn_u32(vcvtq_u32_f32(vaddq_f32(vAngleHi, vHalf)));
+
+            vst1_u8(outRow + col, vmovn_u16(vcombine_u16(vq0, vq1)));
+        }
+
+        // Scalar tail.
+        for (; col < size.width; col++)
+        {
+            f32 xs = xRow[col], ys = yRow[col];
+            f32 angle;
+            FASTATAN2SCALAR(ys, xs, angle)
+            outRow[col] = (u8)(s32)floor(angle + 0.5f);
+        }
+    }
+#else
+    // No NEON: the casts only silence unused-parameter warnings.
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+/*
+ * Per-element angle of the vectors (src0, src1) for f32 inputs. The result is
+ * stored as f32; scale is handed to FASTATAN2CONST, which multiplies the
+ * polynomial coefficients and the 90/180/360 constants by it.
+ * Without CAROTENE_NEON nothing is computed.
+ */
+void phase(const Size2D &size,
+           const f32 * src0Base, ptrdiff_t src0Stride,
+           const f32 * src1Base, ptrdiff_t src1Stride,
+           f32 * dstBase, ptrdiff_t dstStride,
+           f32 scale)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    FASTATAN2CONST(scale)
+
+    // Exclusive upper bound for the start index of an 8-element step.
+    const size_t limit8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const f32 * xRow = internal::getRowPtr(src0Base, src0Stride, row);
+        const f32 * yRow = internal::getRowPtr(src1Base, src1Stride, row);
+        f32 * outRow = internal::getRowPtr(dstBase, dstStride, row);
+        size_t col = 0;
+
+        // 8 elements per iteration; all loads are issued before any store.
+        for (; col < limit8; col += 8)
+        {
+            internal::prefetch(xRow + col);
+            internal::prefetch(yRow + col);
+
+            const float32x4_t vxLo = vld1q_f32(xRow + col);
+            const float32x4_t vxHi = vld1q_f32(xRow + col + 4);
+            const float32x4_t vyLo = vld1q_f32(yRow + col);
+            const float32x4_t vyHi = vld1q_f32(yRow + col + 4);
+
+            float32x4_t vAngle;
+
+            // elements 0..3
+            FASTATAN2VECTOR(vyLo, vxLo, vAngle)
+            vst1q_f32(outRow + col, vAngle);
+
+            // elements 4..7
+            FASTATAN2VECTOR(vyHi, vxHi, vAngle)
+            vst1q_f32(outRow + col + 4, vAngle);
+        }
+
+        // At most one more group of four elements.
+        if (col + 4 <= size.width)
+        {
+            const float32x4_t vxs = vld1q_f32(xRow + col);
+            const float32x4_t vys = vld1q_f32(yRow + col);
+
+            float32x4_t vAngle;
+            FASTATAN2VECTOR(vys, vxs, vAngle)
+            vst1q_f32(outRow + col, vAngle);
+            col += 4;
+        }
+
+        // Scalar tail.
+        for (; col < size.width; col++)
+        {
+            f32 angle;
+            FASTATAN2SCALAR(yRow[col], xRow[col], angle)
+            outRow[col] = angle;
+        }
+    }
+#else
+    // No NEON: the casts only silence unused-parameter warnings.
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)scale;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/pyramid.cpp
+++ b/3rdparty/carotene/src/pyramid.cpp
--- a/3rdparty/carotene/src/reduce.cpp
+++ b/3rdparty/carotene/src/reduce.cpp
@ -0,0 +1,460 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include <cstring>
+
+namespace CAROTENE_NS {
+
+/*
+ * Column-wise sum of a u8 image.
+ *
+ * dstBase[x] receives the sum of column x over all size.height rows as an s32
+ * value, saturated at 0x7fffffff.
+ *
+ *   size      - size.width columns (= number of outputs), size.height rows
+ *   srcBase   - first source row; consecutive rows are srcStride bytes apart
+ *   dstBase   - size.width outputs; zeroed first, then overwritten
+ *
+ * Without CAROTENE_NEON only the configuration assertion runs.
+ */
+void reduceColSum(const Size2D &size,
+                  const u8 * srcBase, ptrdiff_t srcStride,
+                  s32 * dstBase)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    memset(dstBase, 0, size.width*sizeof(s32));
+    size_t i = 0;
+
+    // Vector path: 16 columns per iteration.
+    for (; i + 16 <= size.width; i += 16)
+    {
+        const u8* src_address = srcBase + i;
+
+        // 32-bit running totals for columns 0-3, 4-7, 8-11 and 12-15.
+        int32x4_t sll = vmovq_n_s32(0);
+        int32x4_t slh = vmovq_n_s32(0);
+        int32x4_t shl = vmovq_n_s32(0);
+        int32x4_t shh = vmovq_n_s32(0);
+
+        // Rows are consumed in chunks of at most 256 so that the 16-bit
+        // partial sums cannot overflow (256 * 255 = 65280 <= 65535).
+        for (size_t h = 0; h < size.height; h += 256)
+        {
+            size_t lim = std::min(h + 256, size.height);
+
+            uint16x8_t sl = vmovq_n_u16(0);
+            uint16x8_t sh = vmovq_n_u16(0);
+
+            for (size_t k = h; k < lim; ++k, src_address += srcStride)
+            {
+                internal::prefetch(src_address + srcStride, 0);
+
+                uint8x16_t v = vld1q_u8(src_address);
+
+                sl = vaddw_u8(sl, vget_low_u8(v));
+                sh = vaddw_u8(sh, vget_high_u8(v));
+            }
+
+            // Widen the 16-bit chunk sums to 32 bit ...
+            int32x4_t vsll = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sl)));
+            int32x4_t vslh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sl)));
+            int32x4_t vshl = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sh)));
+            int32x4_t vshh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sh)));
+
+            // ... and fold them into the totals with a saturating add.
+            sll = vqaddq_s32(sll, vsll);
+            slh = vqaddq_s32(slh, vslh);
+            shl = vqaddq_s32(shl, vshl);
+            shh = vqaddq_s32(shh, vshh);
+        }
+
+        vst1q_s32(dstBase + i + 0, sll);
+        vst1q_s32(dstBase + i + 4, slh);
+        vst1q_s32(dstBase + i + 8, shl);
+        vst1q_s32(dstBase + i + 12, shh);
+    }
+
+    // Scalar tail for the remaining (< 16) columns.
+    if (i < size.width)
+    {
+        const s32 maxSum = 0x7fFFffFF;
+
+        for(size_t h = 0; h < size.height; ++h)
+        {
+            for(size_t j = i ; j < size.width; j++ )
+            {
+                const s32 v = srcBase[j + srcStride * h];
+
+                // Check the headroom before adding: detecting the overflow
+                // after a signed "+=" relies on undefined behaviour.
+                dstBase[j] = dstBase[j] > maxSum - v ? maxSum : dstBase[j] + v;
+            }
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+#endif
+}
+
+/*
+ * Column-wise maximum of a u8 image: dstBase[x] = max over all rows of
+ * column x. Rows are srcStride bytes apart; dstBase holds size.width bytes.
+ */
+void reduceColMax(const Size2D &size,
+                  const u8 * srcBase, ptrdiff_t srcStride,
+                  u8 * dstBase)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Seed the result with the first row; the scalar tail below folds the
+    // remaining rows into it in place.
+    memcpy(dstBase, srcBase, size.width);
+
+    size_t col = 0;
+
+    // Wide path: 64 columns (four 16-byte vectors) at a time.
+    while (col + 16*4 <= size.width)
+    {
+        const u8* rowPtr = srcBase + col;
+
+        uint8x16_t best0 = vld1q_u8(rowPtr + 0);
+        uint8x16_t best1 = vld1q_u8(rowPtr + 16);
+        uint8x16_t best2 = vld1q_u8(rowPtr + 32);
+        uint8x16_t best3 = vld1q_u8(rowPtr + 48);
+
+        for (size_t row = 1; row < size.height; ++row)
+        {
+            rowPtr += srcStride;
+
+            internal::prefetch(rowPtr + srcStride, 0);
+            internal::prefetch(rowPtr + srcStride, 32);
+
+            best0 = vmaxq_u8(best0, vld1q_u8(rowPtr + 0));
+            best1 = vmaxq_u8(best1, vld1q_u8(rowPtr + 16));
+            best2 = vmaxq_u8(best2, vld1q_u8(rowPtr + 32));
+            best3 = vmaxq_u8(best3, vld1q_u8(rowPtr + 48));
+        }
+
+        vst1q_u8(dstBase + col + 0, best0);
+        vst1q_u8(dstBase + col + 16, best1);
+        vst1q_u8(dstBase + col + 32, best2);
+        vst1q_u8(dstBase + col + 48, best3);
+
+        col += 16*4;
+    }
+
+    // Narrow path: one 16-byte vector at a time.
+    while (col + 16 <= size.width)
+    {
+        const u8* rowPtr = srcBase + col;
+        uint8x16_t best = vld1q_u8(rowPtr);
+
+        for (size_t row = 1; row < size.height; ++row)
+        {
+            rowPtr += srcStride;
+            internal::prefetch(rowPtr + srcStride, 0);
+            best = vmaxq_u8(best, vld1q_u8(rowPtr));
+        }
+
+        vst1q_u8(dstBase + col, best);
+        col += 16;
+    }
+
+    // Scalar tail: fewer than 16 columns left.
+    if (col < size.width)
+    {
+        for (size_t row = 1; row < size.height; ++row)
+        {
+            for (size_t x = col; x < size.width; ++x)
+            {
+                const u8 candidate = srcBase[x + srcStride * row];
+                if (dstBase[x] < candidate)
+                    dstBase[x] = candidate;
+            }
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+#endif
+}
+
+/*
+ * Column-wise minimum of a u8 image: dstBase[x] = min over all rows of
+ * column x. Rows are srcStride bytes apart; dstBase holds size.width bytes.
+ */
+void reduceColMin(const Size2D &size,
+                  const u8 * srcBase, ptrdiff_t srcStride,
+                  u8 * dstBase)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Seed the result with the first row; the scalar tail below folds the
+    // remaining rows into it in place.
+    memcpy(dstBase, srcBase, size.width);
+
+    size_t col = 0;
+
+    // Wide path: 64 columns (four 16-byte vectors) at a time.
+    while (col + 16*4 <= size.width)
+    {
+        const u8* rowPtr = srcBase + col;
+
+        uint8x16_t low0 = vld1q_u8(rowPtr + 0);
+        uint8x16_t low1 = vld1q_u8(rowPtr + 16);
+        uint8x16_t low2 = vld1q_u8(rowPtr + 32);
+        uint8x16_t low3 = vld1q_u8(rowPtr + 48);
+
+        for (size_t row = 1; row < size.height; ++row)
+        {
+            rowPtr += srcStride;
+
+            internal::prefetch(rowPtr + srcStride, 0);
+            internal::prefetch(rowPtr + srcStride, 32);
+
+            low0 = vminq_u8(low0, vld1q_u8(rowPtr + 0));
+            low1 = vminq_u8(low1, vld1q_u8(rowPtr + 16));
+            low2 = vminq_u8(low2, vld1q_u8(rowPtr + 32));
+            low3 = vminq_u8(low3, vld1q_u8(rowPtr + 48));
+        }
+
+        vst1q_u8(dstBase + col + 0, low0);
+        vst1q_u8(dstBase + col + 16, low1);
+        vst1q_u8(dstBase + col + 32, low2);
+        vst1q_u8(dstBase + col + 48, low3);
+
+        col += 16*4;
+    }
+
+    // Narrow path: one 16-byte vector at a time.
+    while (col + 16 <= size.width)
+    {
+        const u8* rowPtr = srcBase + col;
+        uint8x16_t low = vld1q_u8(rowPtr);
+
+        for (size_t row = 1; row < size.height; ++row)
+        {
+            rowPtr += srcStride;
+            internal::prefetch(rowPtr + srcStride, 0);
+            low = vminq_u8(low, vld1q_u8(rowPtr));
+        }
+
+        vst1q_u8(dstBase + col, low);
+        col += 16;
+    }
+
+    // Scalar tail: fewer than 16 columns left.
+    if (col < size.width)
+    {
+        for (size_t row = 1; row < size.height; ++row)
+        {
+            for (size_t x = col; x < size.width; ++x)
+            {
+                const u8 candidate = srcBase[x + srcStride * row];
+                if (candidate < dstBase[x])
+                    dstBase[x] = candidate;
+            }
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+#endif
+}
+
+/*
+ * Column-wise sum of an f32 image: dstBase[x] = sum over all rows of column x.
+ *
+ *   size      - size.width columns (= number of outputs), size.height rows
+ *   srcBase   - first source row; consecutive rows are srcStride bytes apart
+ *   dstBase   - size.width outputs; seeded with the first row
+ *
+ * NOTE(review): the byte stride is converted to an element step by integer
+ * division, so srcStride is assumed to be a multiple of sizeof(f32) - confirm
+ * against callers.
+ */
+void reduceColSum(const Size2D &size,
+                  const f32 * srcBase, ptrdiff_t srcStride,
+                  f32 * dstBase)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    memcpy(dstBase, srcBase, size.width*sizeof(f32));
+
+    // Keep the element step signed: dividing the ptrdiff_t stride by the
+    // unsigned sizeof() would convert a negative stride to a huge unsigned
+    // value before the division.
+    const ptrdiff_t srcstep = srcStride / static_cast<ptrdiff_t>(sizeof(f32));
+    size_t i = 0;
+
+    // Wide path: 16 columns (four float32x4 vectors) at a time.
+    for (; i + 16 <= size.width; i += 16)
+    {
+        const f32* src_address = srcBase + i;
+
+        float32x4_t s1 = vld1q_f32(src_address + 0);
+        float32x4_t s2 = vld1q_f32(src_address + 4);
+        float32x4_t s3 = vld1q_f32(src_address + 8);
+        float32x4_t s4 = vld1q_f32(src_address + 12);
+
+        src_address += srcstep;
+
+        for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
+        {
+            internal::prefetch(src_address + srcstep, 0);
+            internal::prefetch(src_address + srcstep, 32);
+
+            float32x4_t v1 = vld1q_f32(src_address + 0);
+            float32x4_t v2 = vld1q_f32(src_address + 4);
+            float32x4_t v3 = vld1q_f32(src_address + 8);
+            float32x4_t v4 = vld1q_f32(src_address + 12);
+
+            s1 = vaddq_f32(s1, v1);
+            s2 = vaddq_f32(s2, v2);
+            s3 = vaddq_f32(s3, v3);
+            s4 = vaddq_f32(s4, v4);
+        }
+
+        vst1q_f32(dstBase + i + 0, s1);
+        vst1q_f32(dstBase + i + 4, s2);
+        vst1q_f32(dstBase + i + 8, s3);
+        vst1q_f32(dstBase + i + 12, s4);
+    }
+
+    // Narrow path: one float32x4 vector at a time.
+    for (; i + 4 <= size.width; i += 4)
+    {
+        const f32* src_address = srcBase + i;
+        float32x4_t s1 = vld1q_f32(src_address);
+        src_address += srcstep;
+        for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
+        {
+            internal::prefetch(src_address + srcstep, 0);
+
+            float32x4_t v1 = vld1q_f32(src_address);
+            s1 = vaddq_f32(s1, v1);
+        }
+        vst1q_f32(dstBase + i, s1);
+    }
+
+    // Scalar tail: fewer than 4 columns left; dstBase already holds row 0.
+    if (i < size.width)
+        for(size_t h = 1; h < size.height; ++h)
+        {
+            const f32* src_row = srcBase + srcstep * static_cast<ptrdiff_t>(h);
+            for(size_t j = i ; j < size.width; j++ )
+            {
+                dstBase[j] += src_row[j];
+            }
+        }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+#endif
+}
+
+/*
+ * Column-wise maximum of an f32 image: dstBase[x] = max over all rows of
+ * column x.
+ *
+ *   size      - size.width columns (= number of outputs), size.height rows
+ *   srcBase   - first source row; consecutive rows are srcStride bytes apart
+ *   dstBase   - size.width outputs; seeded with the first row
+ *
+ * NOTE(review): the byte stride is converted to an element step by integer
+ * division, so srcStride is assumed to be a multiple of sizeof(f32) - confirm
+ * against callers.
+ */
+void reduceColMax(const Size2D &size,
+                  const f32 * srcBase, ptrdiff_t srcStride,
+                  f32 * dstBase)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    memcpy(dstBase, srcBase, size.width*sizeof(f32));
+
+    // Keep the element step signed: dividing the ptrdiff_t stride by the
+    // unsigned sizeof() would convert a negative stride to a huge unsigned
+    // value before the division.
+    const ptrdiff_t srcstep = srcStride / static_cast<ptrdiff_t>(sizeof(f32));
+    size_t i = 0;
+
+    // Wide path: 16 columns (four float32x4 vectors) at a time.
+    for (; i + 16 <= size.width; i += 16)
+    {
+        const f32* src_address = srcBase + i;
+
+        float32x4_t s1 = vld1q_f32(src_address + 0);
+        float32x4_t s2 = vld1q_f32(src_address + 4);
+        float32x4_t s3 = vld1q_f32(src_address + 8);
+        float32x4_t s4 = vld1q_f32(src_address + 12);
+
+        src_address += srcstep;
+
+        for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
+        {
+            internal::prefetch(src_address + srcstep, 0);
+            internal::prefetch(src_address + srcstep, 32);
+
+            float32x4_t v1 = vld1q_f32(src_address + 0);
+            float32x4_t v2 = vld1q_f32(src_address + 4);
+            float32x4_t v3 = vld1q_f32(src_address + 8);
+            float32x4_t v4 = vld1q_f32(src_address + 12);
+
+            s1 = vmaxq_f32(s1, v1);
+            s2 = vmaxq_f32(s2, v2);
+            s3 = vmaxq_f32(s3, v3);
+            s4 = vmaxq_f32(s4, v4);
+        }
+
+        vst1q_f32(dstBase + i + 0, s1);
+        vst1q_f32(dstBase + i + 4, s2);
+        vst1q_f32(dstBase + i + 8, s3);
+        vst1q_f32(dstBase + i + 12, s4);
+    }
+
+    // Narrow path: one float32x4 vector at a time.
+    for (; i + 4 <= size.width; i += 4)
+    {
+        const f32* src_address = srcBase + i;
+        float32x4_t s1 = vld1q_f32(src_address);
+        src_address += srcstep;
+        for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
+        {
+            internal::prefetch(src_address + srcstep, 0);
+
+            float32x4_t v1 = vld1q_f32(src_address);
+            s1 = vmaxq_f32(s1, v1);
+        }
+        vst1q_f32(dstBase + i, s1);
+    }
+
+    // Scalar tail: fewer than 4 columns left; dstBase already holds row 0.
+    if (i < size.width)
+        for(size_t h = 1; h < size.height; ++h)
+        {
+            const f32* src_row = srcBase + srcstep * static_cast<ptrdiff_t>(h);
+            for(size_t j = i ; j < size.width; j++ )
+                dstBase[j] = std::max(dstBase[j], src_row[j]);
+        }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+#endif
+}
+
+/*
+ * Column-wise minimum of an f32 image: dstBase[x] = min over all rows of
+ * column x.
+ *
+ *   size      - size.width columns (= number of outputs), size.height rows
+ *   srcBase   - first source row; consecutive rows are srcStride bytes apart
+ *   dstBase   - size.width outputs; seeded with the first row
+ *
+ * NOTE(review): the byte stride is converted to an element step by integer
+ * division, so srcStride is assumed to be a multiple of sizeof(f32) - confirm
+ * against callers.
+ */
+void reduceColMin(const Size2D &size,
+                  const f32 * srcBase, ptrdiff_t srcStride,
+                  f32 * dstBase)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    memcpy(dstBase, srcBase, size.width*sizeof(f32));
+
+    // Keep the element step signed: dividing the ptrdiff_t stride by the
+    // unsigned sizeof() would convert a negative stride to a huge unsigned
+    // value before the division.
+    const ptrdiff_t srcstep = srcStride / static_cast<ptrdiff_t>(sizeof(f32));
+    size_t i = 0;
+
+    // Wide path: 16 columns (four float32x4 vectors) at a time.
+    for (; i + 16 <= size.width; i += 16)
+    {
+        const f32* src_address = srcBase + i;
+
+        float32x4_t s1 = vld1q_f32(src_address + 0);
+        float32x4_t s2 = vld1q_f32(src_address + 4);
+        float32x4_t s3 = vld1q_f32(src_address + 8);
+        float32x4_t s4 = vld1q_f32(src_address + 12);
+
+        src_address += srcstep;
+
+        for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
+        {
+            internal::prefetch(src_address + srcstep, 0);
+            internal::prefetch(src_address + srcstep, 32);
+
+            float32x4_t v1 = vld1q_f32(src_address + 0);
+            float32x4_t v2 = vld1q_f32(src_address + 4);
+            float32x4_t v3 = vld1q_f32(src_address + 8);
+            float32x4_t v4 = vld1q_f32(src_address + 12);
+
+            s1 = vminq_f32(s1, v1);
+            s2 = vminq_f32(s2, v2);
+            s3 = vminq_f32(s3, v3);
+            s4 = vminq_f32(s4, v4);
+        }
+
+        vst1q_f32(dstBase + i + 0, s1);
+        vst1q_f32(dstBase + i + 4, s2);
+        vst1q_f32(dstBase + i + 8, s3);
+        vst1q_f32(dstBase + i + 12, s4);
+    }
+
+    // Narrow path: one float32x4 vector at a time.
+    for (; i + 4 <= size.width; i += 4)
+    {
+        const f32* src_address = srcBase + i;
+        float32x4_t s1 = vld1q_f32(src_address);
+        src_address += srcstep;
+        for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
+        {
+            internal::prefetch(src_address + srcstep, 0);
+
+            float32x4_t v1 = vld1q_f32(src_address);
+            s1 = vminq_f32(s1, v1);
+        }
+        vst1q_f32(dstBase + i, s1);
+    }
+
+    // Scalar tail: fewer than 4 columns left; dstBase already holds row 0.
+    if (i < size.width)
+        for(size_t h = 1; h < size.height; ++h)
+        {
+            const f32* src_row = srcBase + srcstep * static_cast<ptrdiff_t>(h);
+            for(size_t j = i ; j < size.width; j++ )
+                dstBase[j] = std::min(dstBase[j], src_row[j]);
+        }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/remap.cpp
+++ b/3rdparty/carotene/src/remap.cpp
@ -0,0 +1,694 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "remap.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace internal {
+
+/*
+ * Gathers one block of output pixels: every destination byte is the source
+ * byte at the index stored in the matching map entry. The map is densely
+ * packed (size.width entries per row); indices are used without any check.
+ */
+void remapNearestNeighborReplicate(const Size2D size,
+                                   const u8 * srcBase,
+                                   const s32 * map,
+                                   u8 * dstBase, ptrdiff_t dstStride)
+{
+    const size_t mapRowBytes = size.width * sizeof(s32);
+
+    for (size_t row = 0; row != size.height; ++row)
+    {
+        const s32 * indices = internal::getRowPtr(map, mapRowBytes, row);
+        u8 * out = internal::getRowPtr(dstBase, dstStride, row);
+
+        for (size_t col = 0; col != size.width; ++col)
+            out[col] = srcBase[indices[col]];
+    }
+}
+
+/*
+ * Gathers one block of output pixels with a constant border: a negative map
+ * entry yields borderValue, any other entry is used as an index into srcBase.
+ * The map is densely packed (size.width entries per row).
+ */
+void remapNearestNeighborConst(const Size2D size,
+                               const u8 * srcBase,
+                               const s32 * map,
+                               u8 * dstBase, ptrdiff_t dstStride,
+                               u8 borderValue)
+{
+    const size_t mapRowBytes = size.width * sizeof(s32);
+
+    for (size_t row = 0; row != size.height; ++row)
+    {
+        const s32 * indices = internal::getRowPtr(map, mapRowBytes, row);
+        u8 * out = internal::getRowPtr(dstBase, dstStride, row);
+
+        for (size_t col = 0; col != size.width; ++col)
+        {
+            const s32 srcIndex = indices[col];
+
+            if (srcIndex < 0)
+                out[col] = borderValue;
+            else
+                out[col] = srcBase[srcIndex];
+        }
+    }
+}
+
+void remapLinearReplicate(const Size2D size,                 // Interpolating remap of one size.width x size.height block, no border substitution.
+                          const u8 * srcBase,                // source bytes, indexed directly by the values stored in map
+                          const s32 * map,                   // 4 source indices per output pixel; rows are size.width * 4 entries apart
+                          const f32 * coeffs,                // 2 blend weights per output pixel; rows are size.width * 2 entries apart
+                          u8 * dstBase, ptrdiff_t dstStride) // output rows, located via getRowPtr(dstBase, dstStride, y)
+{
+    int16x8_t v_zero16 = vdupq_n_s16(0); // starting value for the lane-by-lane gathers below
+    // NOTE(review): map values are used unchecked; presumably the caller has already clamped them into the source - confirm.
+    for (size_t y = 0; y < size.height; ++y)
+    {
+        const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
+        const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);
+
+        u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
+
+        size_t x = 0;
+        for ( ; x + 8 < size.width; x += 8) // strict '<': a final group of exactly 8 pixels is left to the scalar loop
+        {
+            int16x8_t v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2)]], v_zero16, 0); // map entry 0 of each of the 8 pixels
+            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 4]], v_src00, 1);
+            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 8]], v_src00, 2);
+            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 12]], v_src00, 3);
+            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 16]], v_src00, 4);
+            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 20]], v_src00, 5);
+            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 24]], v_src00, 6);
+            v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 28]], v_src00, 7);
+            // map entry 1 of each pixel: blended with entry 0 by the first coefficient
+            int16x8_t v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 1]], v_zero16, 0);
+            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 5]], v_src01, 1);
+            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 9]], v_src01, 2);
+            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 13]], v_src01, 3);
+            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 17]], v_src01, 4);
+            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 21]], v_src01, 5);
+            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 25]], v_src01, 6);
+            v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 29]], v_src01, 7);
+            // map entry 2 of each pixel
+            int16x8_t v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 2]], v_zero16, 0);
+            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 6]], v_src10, 1);
+            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 10]], v_src10, 2);
+            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 14]], v_src10, 3);
+            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 18]], v_src10, 4);
+            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 22]], v_src10, 5);
+            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 26]], v_src10, 6);
+            v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 30]], v_src10, 7);
+            // map entry 3 of each pixel: blended with entry 2 by the first coefficient
+            int16x8_t v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 3]], v_zero16, 0);
+            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 7]], v_src11, 1);
+            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 11]], v_src11, 2);
+            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 15]], v_src11, 3);
+            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 19]], v_src11, 4);
+            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 23]], v_src11, 5);
+            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 27]], v_src11, 6);
+            v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 31]], v_src11, 7);
+
+            // first part: pixels x .. x+3 (low halves of the gathered vectors)
+            float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
+            float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
+            // vld2q_f32 de-interleaves: val[0] = first coefficient of each pixel, val[1] = second coefficient
+            float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
+            float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
+                                                                               vget_low_s16(v_src00))), v_coeff.val[0]);
+            float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
+                                                                               vget_low_s16(v_src10))), v_coeff.val[0]);
+            // blend the two intermediate results with the second coefficient
+            float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
+            uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst)); // float -> u32 conversion truncates toward zero
+
+            // second part: pixels x+4 .. x+7 (high halves), coefficients start 8 floats further on
+            v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
+            v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
+
+            v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
+            v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
+                                                                   vget_high_s16(v_src00))), v_coeff.val[0]);
+            v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
+                                                                   vget_high_s16(v_src10))), v_coeff.val[0]);
+
+            v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
+            uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));
+
+            // store: narrow the 8 results to bytes (plain narrowing, no saturation)
+            vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
+        }
+        // scalar loop: same interpolation, one pixel at a time, for the remaining 1..8 pixels
+        for ( ; x < size.width; ++x)
+        {
+            s32 src00_index = map_row[(x << 2)];
+            s32 src10_index = map_row[(x << 2) + 2];
+            f32 dst_val_0 = (srcBase[map_row[(x << 2) + 1]] - srcBase[src00_index]) * coeff_row[x << 1] +
+                             srcBase[src00_index];
+            f32 dst_val_1 = (srcBase[map_row[(x << 2) + 3]] - srcBase[src10_index]) * coeff_row[x << 1] +
+                             srcBase[src10_index];
+            dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0); // floorf here vs. truncation in the vector path
+        }
+    }
+}
+
+void remapLinearConst(const Size2D size,                 // Interpolating remap of one size.width x size.height block with a constant border.
+                      const u8 * srcBase,                // source bytes, indexed by the non-negative values stored in map
+                      const s32 * map,                   // 4 source indices per output pixel (negative = border); rows are size.width * 4 entries apart
+                      const f32 * coeffs,                // 2 blend weights per output pixel; rows are size.width * 2 entries apart
+                      u8 * dstBase, ptrdiff_t dstStride, // output rows, located via getRowPtr(dstBase, dstStride, y)
+                      u8 borderValue)                    // sample value substituted for every negative map entry
+{
+    int16x8_t v_zero16 = vdupq_n_s16(0); // starting value for the lane-by-lane gathers below
+
+    for (size_t y = 0; y < size.height; ++y)
+    {
+        const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
+        const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);
+
+        u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
+
+        size_t x = 0;
+        for ( ; x + 8 < size.width; x += 8) // strict '<': a final group of exactly 8 pixels is left to the scalar loop
+        {
+            int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0); // map entry 0 of each of the 8 pixels
+            v_src00 = vsetq_lane_s16(map_row[(x << 2) +  4] >= 0 ? srcBase[map_row[(x << 2) +  4]] : borderValue, v_src00, 1);
+            v_src00 = vsetq_lane_s16(map_row[(x << 2) +  8] >= 0 ? srcBase[map_row[(x << 2) +  8]] : borderValue, v_src00, 2);
+            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3);
+            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4);
+            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5);
+            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6);
+            v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7);
+            // map entry 1 of each pixel: blended with entry 0 by the first coefficient
+            int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0);
+            v_src01 = vsetq_lane_s16(map_row[(x << 2) +  5] >= 0 ? srcBase[map_row[(x << 2) +  5]] : borderValue, v_src01, 1);
+            v_src01 = vsetq_lane_s16(map_row[(x << 2) +  9] >= 0 ? srcBase[map_row[(x << 2) +  9]] : borderValue, v_src01, 2);
+            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3);
+            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4);
+            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5);
+            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6);
+            v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7);
+            // map entry 2 of each pixel
+            int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0);
+            v_src10 = vsetq_lane_s16(map_row[(x << 2) +  6] >= 0 ? srcBase[map_row[(x << 2) +  6]] : borderValue, v_src10, 1);
+            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2);
+            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3);
+            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4);
+            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5);
+            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6);
+            v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7);
+            // map entry 3 of each pixel: blended with entry 2 by the first coefficient
+            int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0);
+            v_src11 = vsetq_lane_s16(map_row[(x << 2) +  7] >= 0 ? srcBase[map_row[(x << 2) +  7]] : borderValue, v_src11, 1);
+            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2);
+            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3);
+            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4);
+            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5);
+            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6);
+            v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7);
+
+            // first part: pixels x .. x+3 (low halves of the gathered vectors)
+            float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
+            float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
+            // vld2q_f32 de-interleaves: val[0] = first coefficient of each pixel, val[1] = second coefficient
+            float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
+            float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
+                                                                               vget_low_s16(v_src00))), v_coeff.val[0]);
+            float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
+                                                                               vget_low_s16(v_src10))), v_coeff.val[0]);
+            // blend the two intermediate results with the second coefficient
+            float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
+            uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst)); // float -> u32 conversion truncates toward zero
+
+            // second part: pixels x+4 .. x+7 (high halves), coefficients start 8 floats further on
+            v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
+            v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
+
+            v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
+            v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
+                                                                   vget_high_s16(v_src00))), v_coeff.val[0]);
+            v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
+                                                                   vget_high_s16(v_src10))), v_coeff.val[0]);
+
+            v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
+            uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));
+
+            // store: narrow the 8 results to bytes (plain narrowing, no saturation)
+            vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
+        }
+        // scalar loop: same interpolation, one pixel at a time, for the remaining 1..8 pixels
+        for ( ; x < size.width; ++x)
+        {
+            s16 src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue;
+            s16 src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue;
+            s16 src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue;
+            s16 src11 = map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue;
+
+            f32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00;
+            f32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10;
+            dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0); // floorf here vs. truncation in the vector path
+        }
+    }
+}
+
+} // namespace internal
+
+#endif // CAROTENE_NEON
+
+// Reports whether remapNearestNeighbor() may be called for a source image of size ssize.
+bool isRemapNearestNeighborSupported(const Size2D &ssize)
+{
+#if SIZE_MAX > UINT32_MAX
+    // size_t is wider than 32 bits: reject sources whose width or height does not
+    // fit in 32 bits, since internal index evaluation is performed with u32.
+    const bool dimsFit32 = (ssize.width <= 0xffffFFFF) && (ssize.height <= 0xffffFFFF);
+    return dimsFit32 && isSupportedConfiguration();
+#else
+    // SIZE_MAX already fits in 32 bits, so no dimension check is made here.
+    (void)ssize;
+    return isSupportedConfiguration();
+#endif
+}
+
+// Reports whether remapLinear() may be called for a source image of size ssize.
+bool isRemapLinearSupported(const Size2D &ssize)
+{
+#if SIZE_MAX > UINT32_MAX
+    // Restrict the image size on targets with a wide size_t, because internal
+    // index evaluation is performed with u32.
+    const bool widthFits = ssize.width <= 0xffffFFFF;
+    const bool heightFits = ssize.height <= 0xffffFFFF;
+    return widthFits && heightFits && isSupportedConfiguration();
+#else
+    // SIZE_MAX already fits in 32 bits, so only the configuration check applies.
+    (void)ssize;
+    return isSupportedConfiguration();
+#endif
+}
+
+// Nearest-neighbour remap of a u8 image through a table of source coordinates.
+//
+// ssize       - source image size; must satisfy isRemapNearestNeighborSupported().
+// dsize       - destination size; the coordinate table is indexed with destination
+//               coordinates as well.
+// srcBase     - source pixels; a sample is addressed by the offset y * srcStride + x.
+// srcStride   - source row pitch used in the offset computation above.
+// tableBase   - per destination pixel, an interleaved (x, y) pair of f32 source coordinates.
+// tableStride - table row pitch as consumed by getRowPtr (presumably in bytes - TODO confirm).
+// dstBase / dstStride - destination pixels and their row pitch.
+// borderMode  - only BORDER_MODE_REPLICATE and BORDER_MODE_CONSTANT have a branch below;
+//               for any other value nothing is written to dstBase.
+// borderValue - forwarded to remapNearestNeighborConst in BORDER_MODE_CONSTANT.
+//
+// The destination is processed in tiles of at most BLOCK_SIZE x BLOCK_SIZE pixels. For each
+// tile a table of s32 source offsets is built first, then the per-tile helper does the copy.
+// Without CAROTENE_NEON the function only performs the configuration assertion.
 void remapNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
                           const u8 * srcBase, ptrdiff_t srcStride,
                           const f32 * tableBase, ptrdiff_t tableStride,
                           u8 * dstBase, ptrdiff_t dstStride,
                           BORDER_MODE borderMode, u8 borderValue)
 {
     internal::assertSupportedConfiguration(isRemapNearestNeighborSupported(ssize));
 #ifdef CAROTENE_NEON
     using namespace internal;
 
+    // One s32 offset per tile pixel. The 16 spare elements presumably leave room for
+    // alignPtr to move the start up to an aligned address - TODO confirm alignPtr semantics.
     s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
     s32 * map = alignPtr(_map, 16);
 
+    // Last valid column/row and the row pitch, broadcast for the 4-lane and 2-lane paths.
+    // Note that srcStride (ptrdiff_t) is narrowed to a 32-bit lane value here.
     int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
     int32x2_t v_width2 = vdup_n_s32(ssize.width - 1), v_height2 = vdup_n_s32(ssize.height - 1);
     int32x4_t v_step4 = vdupq_n_s32(srcStride);
     int32x2_t v_step2 = vdup_n_s32(srcStride);
 
     if (borderMode == BORDER_MODE_REPLICATE)
     {
+        // Replicate: every coordinate is clamped to [0, width - 1] x [0, height - 1],
+        // so each table entry is a valid source offset.
         int32x4_t v_zero4 = vdupq_n_s32(0);
         int32x2_t v_zero2 = vdup_n_s32(0);
 
         for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
         {
             size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
             for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
             {
                 size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
 
                 // compute table
                 for (size_t y = 0; y < blockHeight; ++y)
                 {
+                    // Table rows hold two floats per pixel, hence the (j << 1) column offset.
+                    // Map rows are packed tightly: blockWidth entries per row.
                     const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
                     s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
 
                     size_t x = 0;
+                    // 8 pixels per iteration; vld2q_f32 splits the pairs into val[0] = x, val[1] = y.
                     for ( ; x + 8 <= blockWidth; x += 8)
                     {
                         float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
                                       v_table1 = vld2q_f32(table_row + (x << 1) + 8);
 
+                        // offset = clamp(x) + clamp(y) * srcStride
                         int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
                         int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
                         int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
                         vst1q_s32(map_row + x, v_dst_index);
 
                         v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table1.val[0])));
                         v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table1.val[1])));
                         v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
                         vst1q_s32(map_row + x + 4, v_dst_index);
                     }
 
+                    // 4 pixels per iteration
                     for ( ; x + 4 <= blockWidth; x += 4)
                     {
                         float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));
 
                         int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
                         int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
                         int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
                         vst1q_s32(map_row + x, v_dst_index);
                     }
 
+                    // 2 pixels per iteration
                     for ( ; x + 2 <= blockWidth; x += 2)
                     {
                         float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));
 
                         int32x2_t v_dst_x = vmax_s32(v_zero2, vmin_s32(v_width2, vcvt_s32_f32(v_table0.val[0])));
                         int32x2_t v_dst_y = vmax_s32(v_zero2, vmin_s32(v_height2, vcvt_s32_f32(v_table0.val[1])));
                         int32x2_t v_dst_index = vmla_s32(v_dst_x, v_dst_y, v_step2);
                         vst1_s32(map_row + x, v_dst_index);
                     }
 
+                    // scalar tail: same clamp, using floorf for the float -> int step
                     for ( ; x < blockWidth; ++x)
                     {
                         s32 src_x = std::max(0, std::min<s32>(ssize.width - 1, (s32)floorf(table_row[(x << 1) + 0])));
                         s32 src_y = std::max(0, std::min<s32>(ssize.height - 1, (s32)floorf(table_row[(x << 1) + 1])));
                         map_row[x] = src_y * srcStride + src_x;
                     }
                 }
 
                 // make remap
                 remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
                                               getRowPtr(dstBase, dstStride, i) + j, dstStride);
             }
         }
     }
     else if (borderMode == BORDER_MODE_CONSTANT)
     {
+        // Constant: coordinates outside the source are recorded as offset -1 and
+        // borderValue is handed to the helper together with the table.
         int32x4_t v_m1_4 = vdupq_n_s32(-1);
         int32x2_t v_m1_2 = vdup_n_s32(-1);
         float32x4_t v_zero4 = vdupq_n_f32(0.0f);
         float32x2_t v_zero2 = vdup_n_f32(0.0f);
 
         for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
         {
             size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
             for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
             {
                 size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
 
                 // compute table
                 for (size_t y = 0; y < blockHeight; ++y)
                 {
                     const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
                     s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
 
                     size_t x = 0;
+                    // 8 pixels per iteration
                     for ( ; x + 8 <= blockWidth; x += 8)
                     {
                         float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
                                       v_table1 = vld2q_f32(table_row + (x << 1) + 8);
 
+                        // A lane is valid when the float coordinate is >= 0 (lower bound tested
+                        // on the float itself) and the converted integer is <= width - 1 /
+                        // height - 1. Invalid lanes are replaced by -1.
                         int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
                         int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
                         uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
                                                       vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
                         int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
                         vst1q_s32(map_row + x, v_dst_index);
 
                         v_dst_x = vcvtq_s32_f32(v_table1.val[0]);
                         v_dst_y = vcvtq_s32_f32(v_table1.val[1]);
                         v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table1.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
                                            vandq_u32(vcgeq_f32(v_table1.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
                         v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
                         vst1q_s32(map_row + x + 4, v_dst_index);
                     }
 
+                    // 4 pixels per iteration
                     for ( ; x + 4 <= blockWidth; x += 4)
                     {
                         float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));
 
                         int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
                         int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
                         uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
                                                       vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
                         int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
                         vst1q_s32(map_row + x, v_dst_index);
                     }
 
+                    // 2 pixels per iteration
                     for ( ; x + 2 <= blockWidth; x += 2)
                     {
                         float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));
 
                         int32x2_t v_dst_x = vcvt_s32_f32(v_table0.val[0]);
                         int32x2_t v_dst_y = vcvt_s32_f32(v_table0.val[1]);
                         uint32x2_t v_mask = vand_u32(vand_u32(vcge_f32(v_table0.val[0], v_zero2), vcle_s32(v_dst_x, v_width2)),
                                                      vand_u32(vcge_f32(v_table0.val[1], v_zero2), vcle_s32(v_dst_y, v_height2)));
                         int32x2_t v_dst_index = vbsl_s32(v_mask, vmla_s32(v_dst_x, v_dst_y, v_step2), v_m1_2);
                         vst1_s32(map_row + x, v_dst_index);
                     }
 
+                    // scalar tail: floor the coordinate, then range-check the integer
                     for ( ; x < blockWidth; ++x)
                     {
                         s32 src_x = (s32)floorf(table_row[(x << 1) + 0]);
                         s32 src_y = (s32)floorf(table_row[(x << 1) + 1]);
                         map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
                                      (src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
                     }
                 }
 
                 // make remap
                 remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
                                           getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
             }
         }
     }
 
 #else
+    // No NEON: nothing is computed; the casts only silence unused-parameter warnings.
     (void)ssize;
     (void)dsize;
     (void)srcBase;
     (void)srcStride;
     (void)tableBase;
     (void)tableStride;
     (void)dstBase;
     (void)dstStride;
     (void)borderMode;
     (void)borderValue;
 #endif
 }
+
+// Bilinear remap of a u8 image through a table of source coordinates.
+//
+// ssize       - source image size; must satisfy isRemapLinearSupported().
+// dsize       - destination size; the coordinate table is indexed with destination
+//               coordinates as well.
+// srcBase     - source pixels; a sample is addressed by the offset y * srcStride + x.
+// srcStride   - source row pitch used in the offset computation above.
+// tableBase   - per destination pixel, an interleaved (x, y) pair of f32 source coordinates.
+// tableStride - table row pitch as consumed by getRowPtr (presumably in bytes - TODO confirm).
+// dstBase / dstStride - destination pixels and their row pitch.
+// borderMode  - only BORDER_MODE_REPLICATE and BORDER_MODE_CONSTANT have a branch below;
+//               for any other value nothing is written to dstBase.
+// borderValue - forwarded to remapLinearConst in BORDER_MODE_CONSTANT.
+//
+// The destination is processed in tiles of at most BLOCK_SIZE x BLOCK_SIZE pixels. For each
+// tile pixel the code stores four source offsets, in the order (x0,y0), (x1,y0), (x0,y1),
+// (x1,y1), and two fractional weights (x fraction, y fraction); the per-tile helper then
+// produces the output. Without CAROTENE_NEON the function only performs the assertion.
 void remapLinear(const Size2D &ssize, const Size2D &dsize,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  const f32 * tableBase, ptrdiff_t tableStride,
                  u8 * dstBase, ptrdiff_t dstStride,
                  BORDER_MODE borderMode, u8 borderValue)
 {
     internal::assertSupportedConfiguration(isRemapLinearSupported(ssize));
 #ifdef CAROTENE_NEON
     using namespace internal;
 
+    // Four offsets (<< 2) and two weights (<< 1) per tile pixel. The 16 spare elements
+    // presumably leave room for alignPtr to move the start up - TODO confirm alignPtr semantics.
     s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
     f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
 
     s32 * map = alignPtr(_map, 16);
     f32 * coeffs = alignPtr(_coeffs, 16);
 
+    // Broadcast constants: last valid column/row, row pitch (narrowed to 32-bit lanes), 1, 0.0f, 1.0f.
     int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
     int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
     float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);
 
     if (borderMode == BORDER_MODE_REPLICATE)
     {
+        // Replicate: all four neighbour coordinates are clamped into the source.
         int32x4_t v_zero4 = vdupq_n_s32(0);
 
         for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
         {
             size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
             for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
             {
                 size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
 
                 // compute table
                 for (size_t y = 0; y < blockHeight; ++y)
                 {
+                    // Table rows hold two floats per pixel, hence the (j << 1) column offset.
                     const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
 
+                    // Tightly packed tile rows: 4 offsets and 2 weights per pixel.
                     s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
                     f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
 
                     size_t x = 0;
+                    // 4 pixels per iteration; vld2q_f32 splits the pairs into val[0] = x, val[1] = y.
                     for ( ; x + 4 <= blockWidth; x += 4)
                     {
                         float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));
 
                         int32x4_t v_src_x = vcvtq_s32_f32(v_table.val[0]);
                         int32x4_t v_src_y = vcvtq_s32_f32(v_table.val[1]);
 
+                        // weight = coordinate - converted integer. Where that difference is
+                        // negative, add 1 to the weight and subtract 1 from the integer, so the
+                        // pair matches the floorf-based scalar tail below.
                         float32x4x2_t  v_coeff;
                         v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x));
                         v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y));
                         uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
                         uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
                         v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
                         v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
                         v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
                         v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);
 
+                        // Clamp both the base coordinate and its +1 neighbour.
                         int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
                         int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
                         int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
                         int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));
 
+                        // offset = x + y * srcStride for each of the four neighbours
                         int32x4x4_t v_dst_index;
                         v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
                         v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
                         v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
                         v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);
 
+                        // Interleaving stores: same layout as the scalar tail
+                        // (coeff_row[2x], coeff_row[2x + 1], map_row[4x .. 4x + 3]).
                         vst2q_f32(coeff_row + (x << 1), v_coeff);
                         vst4q_s32(map_row + (x << 2), v_dst_index);
                     }
 
+                    // scalar tail
                     for ( ; x < blockWidth; ++x)
                     {
                         f32 src_x_f = table_row[(x << 1) + 0];
                         f32 src_y_f = table_row[(x << 1) + 1];
 
                         s32 src0_x = (s32)floorf(src_x_f);
                         s32 src0_y = (s32)floorf(src_y_f);
 
+                        // Weights are taken from the unclamped floor values.
                         coeff_row[x << 1] = src_x_f - src0_x;
                         coeff_row[(x << 1) + 1] = src_y_f - src0_y;
 
+                        // The +1 neighbours are derived before src0_* is overwritten by its clamp.
                         s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
                         src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
                         s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
                         src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));
 
                         map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
                         map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
                         map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
                         map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
                     }
                 }
 
                 remapLinearReplicate(Size2D(blockWidth, blockHeight),
                                      srcBase, &map[0], &coeffs[0],
                                      getRowPtr(dstBase, dstStride, i) + j, dstStride);
             }
         }
     }
     else if (borderMode == BORDER_MODE_CONSTANT)
     {
+        // Constant: each of the four neighbours is validated separately; a neighbour
+        // outside the source is recorded as offset -1.
         float32x4_t v_zero4 = vdupq_n_f32(0.0f);
         int32x4_t v_m1_4 = vdupq_n_s32(-1);
 
         for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
         {
             size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
             for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
             {
                 size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
 
                 // compute table
                 for (size_t y = 0; y < blockHeight; ++y)
                 {
                     const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
 
                     s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
                     f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
 
                     size_t x = 0;
+                    // 4 pixels per iteration
                     for ( ; x + 4 <= blockWidth; x += 4)
                     {
                         float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));
 
                         int32x4_t v_src_x0 = vcvtq_s32_f32(v_table.val[0]);
                         int32x4_t v_src_y0 = vcvtq_s32_f32(v_table.val[1]);
 
+                        // Same weight / integer-part correction as in the replicate branch.
                         float32x4x2_t v_coeff;
                         v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x0));
                         v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y0));
                         uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
                         uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
                         v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
                         v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
                         v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
                         v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);
 
                         int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
                         int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);
 
+                        // Unclamped offsets; invalid ones are overwritten with -1 below.
                         int32x4x4_t v_dst_index;
                         v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
                         v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
                         v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
                         v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);
 
+                        // Lower bounds are tested on the float coordinate (coordinate >= 0 for the
+                        // base sample, coordinate + 1 >= 0 for the +1 neighbour); upper bounds are
+                        // tested on the integer against width - 1 / height - 1.
                         uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_table.val[0], v_zero4), vcleq_s32(v_src_x0, v_width4));
                         uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[0], v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
                         uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_table.val[1], v_zero4), vcleq_s32(v_src_y0, v_height4));
                         uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[1], v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));
 
                         v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
                         v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
                         v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
                         v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);
 
                         vst2q_f32(coeff_row + (x << 1), v_coeff);
                         vst4q_s32(map_row + (x << 2), v_dst_index);
                     }
 
+                    // scalar tail
                     for ( ; x < blockWidth; ++x)
                     {
                         f32 src_x_f = table_row[(x << 1) + 0];
                         f32 src_y_f = table_row[(x << 1) + 1];
 
                         s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
                         s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;
 
                         coeff_row[(x << 1)] = src_x_f - src0_x;
                         coeff_row[(x << 1) + 1] = src_y_f - src0_y;
 
+                        // One range check per neighbour; -1 marks a neighbour outside the source.
                         map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
                                                 (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
                         map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
                                                 (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
                         map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
                                                 (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
                         map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
                                                 (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
                     }
                 }
 
                 remapLinearConst(Size2D(blockWidth, blockHeight),
                                  srcBase, &map[0], &coeffs[0],
                                  getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
             }
         }
     }
 #else
+    // No NEON: nothing is computed; the casts only silence unused-parameter warnings.
     (void)ssize;
     (void)dsize;
     (void)srcBase;
     (void)srcStride;
     (void)tableBase;
     (void)tableStride;
     (void)dstBase;
     (void)dstStride;
     (void)borderMode;
     (void)borderValue;
 #endif
 }
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/remap.hpp
+++ b/3rdparty/carotene/src/remap.hpp
@ -0,0 +1,85 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#ifndef CAROTENE_SRC_REMAP_HPP
+#define CAROTENE_SRC_REMAP_HPP
+
+#include "common.hpp"
+
+#include <cmath>
+
+#ifdef CAROTENE_NEON
+
+// Per-tile helpers shared by the remap entry points. The notes below describe how
+// remap.cpp calls them; the implementations are not shown in this header.
 namespace CAROTENE_NS { namespace internal {
 
+// Edge length, in pixels, of the square destination tile that remap.cpp processes at a
+// time; its temporary index/weight buffers are sized from BLOCK_SIZE * BLOCK_SIZE.
 enum
 {
     BLOCK_SIZE = 32
 };
 
 
+// Nearest-neighbour tile copy, replicate border.
+// size    - tile size (at most BLOCK_SIZE x BLOCK_SIZE at the call sites).
+// srcBase - source pixels.
+// map     - one s32 source offset per tile pixel, rows packed at size.width entries;
+//           callers clamp coordinates so every offset refers to a source pixel.
+// dstBase / dstStride - top-left of the destination tile and the destination row pitch.
 void remapNearestNeighborReplicate(const Size2D size,
                                    const u8 * srcBase,
                                    const s32 * map,
                                    u8 * dstBase, ptrdiff_t dstStride);
 
+// Nearest-neighbour tile copy, constant border.
+// Same arguments as remapNearestNeighborReplicate, except that callers store -1 in map
+// for coordinates outside the source; borderValue is presumably written for those
+// pixels - TODO confirm against the implementation.
 void remapNearestNeighborConst(const Size2D size,
                                const u8 * srcBase,
                                const s32 * map,
                                u8 * dstBase, ptrdiff_t dstStride,
                                u8 borderValue);
 
+// Bilinear tile interpolation, replicate border.
+// map    - four s32 source offsets per tile pixel, stored by callers in the order
+//          (x0,y0), (x1,y0), (x0,y1), (x1,y1).
+// coeffs - two f32 weights per tile pixel: the x fraction followed by the y fraction.
+// Remaining arguments as for remapNearestNeighborReplicate.
 void remapLinearReplicate(const Size2D size,
                           const u8 * srcBase,
                           const s32 * map,
                           const f32 * coeffs,
                           u8 * dstBase, ptrdiff_t dstStride);
 
+// Bilinear tile interpolation, constant border.
+// Same layout as remapLinearReplicate; callers store -1 for any neighbour outside the
+// source, and borderValue is presumably substituted for such neighbours - TODO confirm.
 void remapLinearConst(const Size2D size,
                       const u8 * srcBase,
                       const s32 * map,
                       const f32 * coeffs,
                       u8 * dstBase, ptrdiff_t dstStride,
                       u8 borderValue);
 
 } }
+
+#endif // CAROTENE_NEON
+
+#endif // CAROTENE_SRC_REMAP_HPP
--- a/3rdparty/carotene/src/resize.cpp
+++ b/3rdparty/carotene/src/resize.cpp
--- a/3rdparty/carotene/src/saturate_cast.hpp
+++ b/3rdparty/carotene/src/saturate_cast.hpp
@ -0,0 +1,199 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#ifndef CAROTENE_SATURATE_CAST_HPP
+#define CAROTENE_SATURATE_CAST_HPP
+
+#include <algorithm>
+#include <climits>
+#include <cmath>
+
+#if defined _MSC_VER && defined _M_ARM
+# include <intrin.h>
+#endif
+
+#include <carotene/definitions.hpp>
+#include <carotene/types.hpp>
+
+namespace CAROTENE_NS { namespace internal {
+
+// Selection of a fast float/double -> s32 rounding primitive.
+//
+// CAROTENE_ROUND_FLT(value) and CAROTENE_ROUND_DBL(value) expand to a complete
+// "return <rounded value>;" fragment and are meant to be the body of the round()
+// overloads defined after this section. When neither macro ends up defined
+// (unknown compiler), round() uses its portable fallback instead.
+#if defined _MSC_VER && defined _M_ARM
+
+// MSVC for ARM: hand-emitted instruction sequence (see the per-line mnemonics).
+// The function is declared as returning void and is called through a cast to
+// s32 (*)(f64) by the macros below, which pick the result up from r0.
+__declspec(naked) static void vcvtr_s32_f64_imp(f64 d)
+{
+    (void)d;
+    __emit(0xEEBD);  // vcvtr.s32.f64 s0, d0
+    __emit(0x0B40);
+    __emit(0xEE10);  // vmov r0, s0
+    __emit(0x0A10);
+    __emit(0x4770);  // bx lr
+}
+
+// The argument is parenthesized so that an expression argument is cast as a whole.
+# define CAROTENE_ROUND_FLT(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)((f64)(x));
+# define CAROTENE_ROUND_DBL(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)(x);
+
+#elif defined CV_ICC || defined __GNUC__
+
+# if defined(__VFP_FP__) && !defined(__SOFTFP__) && !(defined _DEBUG || defined DEBUG) && !defined(__CUDACC__)
+// Hardware VFP, non-debug build: convert with a single VFP instruction and read the
+// integer result back through a union. The locals are plain automatic variables: the
+// 'register' storage class is only a hint and is no longer valid in C++17.
+#  define CAROTENE_ROUND_FLT(value) {                              \
+    union { f32 f; s32 i; } result;                             \
+    asm ("ftosis  %0, %1 \n" : "=w" (result.f) : "w" (value) ); \
+    return result.i; }
+#  define CAROTENE_ROUND_DBL(value) {                      \
+    union {f32 f; s32 i;} __tegra_result;               \
+    asm (                                               \
+        "ftosid  %0, %P1\n"                             \
+        : "=w" (__tegra_result.f)                       \
+        : "w" (value)                                   \
+    );                                                  \
+    return __tegra_result.i;                            \
+    }
+# else
+// Generic GCC-compatible path: lrintf/lrint round according to the current rounding mode.
+// Both macros use their own parameter, so they work for any argument expression.
+#  define CAROTENE_ROUND_FLT(value) return (s32)lrintf(value);
+#  define CAROTENE_ROUND_DBL(value) return (s32)lrint(value);
+# endif
+
+#endif
+
+// Rounds a f32 to s32.
+// Uses the platform primitive CAROTENE_ROUND_FLT when one was selected; otherwise falls
+// back to a portable path that rounds to nearest and keeps an even integer part on
+// exact halves.
+inline s32 round(f32 value)
+{
+#ifdef CAROTENE_ROUND_FLT
+    CAROTENE_ROUND_FLT(value)
+#else
+    const s32 whole = (s32)(value);
+    const f32 frac = value - whole;
+    const bool exactHalf = (frac == 0.5 || frac == -0.5);
+    // An exact half whose truncated part is even stays at that even value.
+    if (exactHalf && (whole % 2) == 0)
+        return whole;
+    // Everything else: shift by half a unit away from zero and truncate.
+    return (s32)(value + (value >= 0 ? 0.5 : -0.5));
+#endif
+}
+
+// Rounds a f64 to s32.
+// Uses the platform primitive CAROTENE_ROUND_DBL when one was selected; otherwise falls
+// back to a portable path that rounds to nearest and keeps an even integer part on
+// exact halves.
+inline s32 round(f64 value)
+{
+#ifdef CAROTENE_ROUND_DBL
+    CAROTENE_ROUND_DBL(value)
+#else
+    const s32 whole = (s32)(value);
+    const f64 frac = value - whole;
+    const bool exactHalf = (frac == 0.5 || frac == -0.5);
+    // An exact half whose truncated part is even stays at that even value.
+    if (exactHalf && (whole % 2) == 0)
+        return whole;
+    // Everything else: shift by half a unit away from zero and truncate.
+    return (s32)(value + (value >= 0 ? 0.5 : -0.5));
+#endif
+}
 /////////////// saturate_cast (used in image & signal processing) ///////////////////
+//
+// saturate_cast<_Tp>(v) converts v to _Tp.
+//  - The primary templates below perform a plain conversion _Tp(v); they are used for
+//    every (source, destination) pair that has no explicit specialization.
+//  - The specializations clamp the value to the destination range before converting.
+//  - Floating-point sources in the specializations are first rounded to s32 with
+//    round() and then handled like an s32 source.
+//  - Several specializations test a signed range with one unsigned comparison:
+//    after the cast to an unsigned type, negative inputs become large values and
+//    fail the "<= max" test, so the second conditional only has to pick the end.
 
+// Primary templates: plain conversion, no clamping.
 template<typename _Tp> inline _Tp saturate_cast(u8 v)    { return _Tp(v); }
 template<typename _Tp> inline _Tp saturate_cast(s8 v)    { return _Tp(v); }
 template<typename _Tp> inline _Tp saturate_cast(u16 v)   { return _Tp(v); }
 template<typename _Tp> inline _Tp saturate_cast(s16 v)   { return _Tp(v); }
 template<typename _Tp> inline _Tp saturate_cast(u32 v)   { return _Tp(v); }
 template<typename _Tp> inline _Tp saturate_cast(s32 v)   { return _Tp(v); }
 template<typename _Tp> inline _Tp saturate_cast(s64 v)   { return _Tp(v); }
 template<typename _Tp> inline _Tp saturate_cast(u64 v)   { return _Tp(v); }
 template<typename _Tp> inline _Tp saturate_cast(f32 v)   { return _Tp(v); }
 template<typename _Tp> inline _Tp saturate_cast(f64 v)   { return _Tp(v); }
 
+// Destination u8: clamp to [0, UCHAR_MAX].
 template<> inline u8 saturate_cast<u8>(s8 v)      { return (u8)std::max((s32)v, 0); }
 template<> inline u8 saturate_cast<u8>(u16 v)     { return (u8)std::min((u32)v, (u32)UCHAR_MAX); }
 template<> inline u8 saturate_cast<u8>(s32 v)     { return (u8)((u32)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
 template<> inline u8 saturate_cast<u8>(s16 v)     { return saturate_cast<u8>((s32)v); }
 template<> inline u8 saturate_cast<u8>(u32 v)     { return (u8)std::min(v, (u32)UCHAR_MAX); }
 template<> inline u8 saturate_cast<u8>(s64 v)     { return (u8)((u64)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
 template<> inline u8 saturate_cast<u8>(u64 v)     { return (u8)std::min(v, (u64)UCHAR_MAX); }
 template<> inline u8 saturate_cast<u8>(f32 v)     { return saturate_cast<u8>(round(v)); }
 template<> inline u8 saturate_cast<u8>(f64 v)     { return saturate_cast<u8>(round(v)); }
 
+// Destination s8: clamp to [SCHAR_MIN, SCHAR_MAX].
 template<> inline s8 saturate_cast<s8>(u8 v)      { return (s8)std::min((s32)v, SCHAR_MAX); }
 template<> inline s8 saturate_cast<s8>(u16 v)     { return (s8)std::min((u32)v, (u32)SCHAR_MAX); }
 template<> inline s8 saturate_cast<s8>(s32 v)     { return (s8)((u32)(v-SCHAR_MIN) <= (u32)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
 template<> inline s8 saturate_cast<s8>(s16 v)     { return saturate_cast<s8>((s32)v); }
 template<> inline s8 saturate_cast<s8>(u32 v)     { return (s8)std::min(v, (u32)SCHAR_MAX); }
 template<> inline s8 saturate_cast<s8>(s64 v)     { return (s8)((u64)(v-SCHAR_MIN) <= (u64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
 template<> inline s8 saturate_cast<s8>(u64 v)     { return (s8)std::min(v, (u64)SCHAR_MAX); }
 template<> inline s8 saturate_cast<s8>(f32 v)     { return saturate_cast<s8>(round(v)); }
 template<> inline s8 saturate_cast<s8>(f64 v)     { return saturate_cast<s8>(round(v)); }
 
+// Destination u16: clamp to [0, USHRT_MAX].
 template<> inline u16 saturate_cast<u16>(s8 v)    { return (u16)std::max((s32)v, 0); }
 template<> inline u16 saturate_cast<u16>(s16 v)   { return (u16)std::max((s32)v, 0); }
 template<> inline u16 saturate_cast<u16>(s32 v)   { return (u16)((u32)v <= (u32)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
 template<> inline u16 saturate_cast<u16>(u32 v)   { return (u16)std::min(v, (u32)USHRT_MAX); }
 template<> inline u16 saturate_cast<u16>(s64 v)   { return (u16)((u64)v <= (u64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
 template<> inline u16 saturate_cast<u16>(u64 v)   { return (u16)std::min(v, (u64)USHRT_MAX); }
 template<> inline u16 saturate_cast<u16>(f32 v)   { return saturate_cast<u16>(round(v)); }
 template<> inline u16 saturate_cast<u16>(f64 v)   { return saturate_cast<u16>(round(v)); }
 
+// Destination s16: clamp to [SHRT_MIN, SHRT_MAX].
 template<> inline s16 saturate_cast<s16>(u16 v)   { return (s16)std::min((s32)v, SHRT_MAX); }
 template<> inline s16 saturate_cast<s16>(s32 v)   { return (s16)((u32)(v - SHRT_MIN) <= (u32)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
 template<> inline s16 saturate_cast<s16>(u32 v)   { return (s16)std::min(v, (u32)SHRT_MAX); }
 template<> inline s16 saturate_cast<s16>(s64 v)   { return (s16)((u64)(v - SHRT_MIN) <= (u64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
 template<> inline s16 saturate_cast<s16>(u64 v)   { return (s16)std::min(v, (u64)SHRT_MAX); }
 template<> inline s16 saturate_cast<s16>(f32 v)   { return saturate_cast<s16>(round(v)); }
 template<> inline s16 saturate_cast<s16>(f64 v)   { return saturate_cast<s16>(round(v)); }
 
+// Destination u32: signed integer sources are clamped at 0, 64-bit sources at UINT_MAX.
 template<> inline u32 saturate_cast<u32>(s8 v)    { return (u32)std::max(v, (s8)0); }
 template<> inline u32 saturate_cast<u32>(s16 v)   { return (u32)std::max(v, (s16)0); }
 template<> inline u32 saturate_cast<u32>(s32 v)   { return (u32)std::max(v, (s32)0); }
 template<> inline u32 saturate_cast<u32>(s64 v)   { return (u32)((u64)v <= (u64)UINT_MAX ? v : v > 0 ? UINT_MAX : 0); }
 template<> inline u32 saturate_cast<u32>(u64 v)   { return (u32)std::min(v, (u64)UINT_MAX); }
 //OpenCV like f32/f64 -> u32 conversion
 //we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
 template<> inline u32 saturate_cast<u32>(f32 v)   { return round(v); }
 template<> inline u32 saturate_cast<u32>(f64 v)   { return round(v); }
 //Negative clipping implementation
 //template<> inline u32 saturate_cast<u32>(f32 v)   { return saturate_cast<u32>(round(v)); }
 //template<> inline u32 saturate_cast<u32>(f64 v)   { return saturate_cast<u32>(round(v)); }
 
+// Destination s32: clamp to [INT_MIN, INT_MAX]; floating-point sources are only rounded.
 template<> inline s32 saturate_cast<s32>(u32 v)   { return (s32)std::min(v, (u32)INT_MAX); }
 template<> inline s32 saturate_cast<s32>(s64 v)   { return (s32)((u64)(v - INT_MIN) <= (u64)UINT_MAX ? v : v > 0 ? INT_MAX : INT_MIN); }
 template<> inline s32 saturate_cast<s32>(u64 v)   { return (s32)std::min(v, (u64)INT_MAX); }
 template<> inline s32 saturate_cast<s32>(f32 v)   { return round(v); }
 template<> inline s32 saturate_cast<s32>(f64 v)   { return round(v); }
 
+// Destination u64: signed integer sources are clamped at 0.
 template<> inline u64 saturate_cast<u64>(s8 v)    { return (u64)std::max(v, (s8)0); }
 template<> inline u64 saturate_cast<u64>(s16 v)   { return (u64)std::max(v, (s16)0); }
 template<> inline u64 saturate_cast<u64>(s32 v)   { return (u64)std::max(v, (s32)0); }
 template<> inline u64 saturate_cast<u64>(s64 v)   { return (u64)std::max(v, (s64)0); }
 
+// Destination s64: u64 sources are clamped at LLONG_MAX.
 template<> inline s64 saturate_cast<s64>(u64 v)   { return (s64)std::min(v, (u64)LLONG_MAX); }
+
+} }
+
+#endif
--- a/3rdparty/carotene/src/scharr.cpp
+++ b/3rdparty/carotene/src/scharr.cpp
@ -0,0 +1,219 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include <vector>
+
+#include "common.hpp"
+
+namespace CAROTENE_NS {
+
+// Reports whether Scharr3x3() can handle the request.
+// Only the two pure first-order derivatives are accepted: (dx, dy) == (0, 1) or (1, 0).
+// Each maps onto a separable 3x3 filter whose generic (id 3) kernel lies along one axis and
+// whose difference (id 1) kernel lies along the other; the remaining checks are delegated.
+bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
+{
+    if (dx == 0 && dy == 1)
+        return isSeparableFilter3x3Supported(size, border, 3, 1, borderMargin);
+
+    if (dx == 1 && dy == 0)
+        return isSeparableFilter3x3Supported(size, border, 1, 3, borderMargin);
+
+    return false;
+}
+
+// 3x3 Scharr derivative of an 8-bit image into a 16-bit signed destination.
+// The work is forwarded to SeparableFilter3x3(): the {3, 10, 3} smoothing weights are passed
+// as the generic (id 3) kernel on one axis and the built-in difference kernel (id 1) is used
+// on the other axis. The configuration is asserted via isScharr3x3Supported().
+void Scharr3x3(const Size2D &size,
+               const u8 * srcBase, ptrdiff_t srcStride,
+               s16 * dstBase, ptrdiff_t dstStride,
+               s32 dx, s32 dy,
+               BORDER_MODE border, u8 borderValue, Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isScharr3x3Supported(size, border, dx, dy, borderMargin));
+#ifdef CAROTENE_NEON
+    static s16 smoothing[] = {3, 10, 3};
+
+    // Pick which axis receives the custom smoothing weights.
+    u8 rowFilter = 1, colFilter = 1;
+    const s16 *rowWeights = 0;
+    const s16 *colWeights = 0;
+    if (dy == 1)
+    {
+        rowFilter = 3;
+        rowWeights = smoothing;
+    }
+    else
+    {
+        colFilter = 3;
+        colWeights = smoothing;
+    }
+
+    SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
+                       rowFilter, colFilter, rowWeights, colWeights,
+                       border, borderValue, borderMargin);
+#else
+    // Without NEON there is no implementation; just silence unused-parameter warnings.
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)borderValue;
+#endif
+}
+
+// Computes both first-order 3x3 Scharr derivatives of an 8-bit image in a single pass.
+//
+//   size                 image size in pixels
+//   cn                   number of interleaved channels per pixel
+//   srcBase / srcStride  8-bit source rows
+//   dstBase / dstStride  16-bit destination rows; for every source element x the pair
+//                        (d/dx, d/dy) is stored interleaved at drow[2*x] and drow[2*x + 1]
+//
+// Per row, a vertical pass fills two temporary rows:
+//   trow0 = 3*(above + below) + 10*current   (vertical smoothing)
+//   trow1 = below - above                    (vertical difference)
+// and a horizontal pass combines them:
+//   d/dx = trow0[x + cn] - trow0[x - cn]
+//   d/dy = 3*(trow1[x - cn] + trow1[x + cn]) + 10*trow1[x]
+// Rows and columns outside the image are mirrored without repeating the edge pixel
+// (row -1 maps to row 1, column -1 maps to column 1, and likewise at the far edges).
+void ScharrDeriv(const Size2D &size, s32 cn,
+                 const u8 * srcBase, ptrdiff_t srcStride,
+                 s16 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t colsn = size.width*cn;
+    // Last start index (exclusive) at which a full 8-element vector still fits in the row.
+    size_t roiw8 = colsn > 7 ? colsn - 7 : 0;
+
+    // Each temporary row holds one extra pixel on both sides and is rounded up to 16 elements.
+    ptrdiff_t delta = (ptrdiff_t)(((size.width + 2)*cn + 15) & -16);//align size
+    std::vector<s16> _tempBuf((delta << 1) + 64);
+    // Start at element cn so that trow0[-cn .. -1] (the left border) stays inside the buffer.
+    s16 *trow0 = internal::alignPtr(&_tempBuf[cn], 16), *trow1 = internal::alignPtr(trow0 + delta, 16);
+
+    int16x8_t vc3 = vmovq_n_s16(3);
+    int16x8_t vc10 = vmovq_n_s16(10);
+    uint8x8_t v8c10 = vmov_n_u8(10);
+
+    for(size_t y = 0; y < size.height; y++ )
+    {
+        // Neighbouring rows, mirrored at the top and bottom edges (single-row images reuse row 0).
+        const u8* srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : size.height > 1 ? 1 : 0);
+        const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
+        const u8* srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height > 1 ? size.height-2 : 0);
+        s16* drow = internal::getRowPtr(dstBase, dstStride, y);
+
+        // do vertical convolution
+        size_t x = 0;
+        for( ; x < roiw8; x += 8 )
+        {
+            internal::prefetch(srow0 + x);
+            internal::prefetch(srow1 + x);
+            internal::prefetch(srow2 + x);
+            // The hand-written ARMv7 assembly is only selected for 32-bit GCC 4.x older than 4.7.
+            // Testing __GNUC_MINOR__ alone also matched newer major versions (e.g. GCC 5.2),
+            // clang (which reports itself as 4.2) and AArch64, where this syntax does not assemble.
+#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
+            __asm__ (
+                "vld1.8 {d0}, [%[src0]]                                \n\t"
+                "vld1.8 {d2}, [%[src2]]                                \n\t"
+                "vld1.8 {d1}, [%[src1]]                                \n\t"
+                "vaddl.u8 q2, d2, d0                                   \n\t"
+                "vmull.u8 q3, d1, %[vc10]                              \n\t"
+                "vsubl.u8 q4, d2, d0                                   \n\t"
+                "vmla.s16 q3, q2, %q[vc3]                              \n\t"
+                "vst1.16 {d8-d9}, [%[out1],:128]                       \n\t"
+                "vst1.16 {d6-d7}, [%[out0],:128]                       \n\t"
+                :
+                : [out0] "r" (trow0 + x),
+                  [out1] "r" (trow1 + x),
+                  [src0] "r" (srow0 + x),
+                  [src1] "r" (srow1 + x),
+                  [src2] "r" (srow2 + x),
+                  [vc10] "w" (v8c10), [vc3] "w" (vc3)
+                // "memory": the block reads and writes through the pointer operands above
+                : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","memory"
+            );
+#else
+            uint8x8_t s0 = vld1_u8(srow0 + x);
+            uint8x8_t s1 = vld1_u8(srow1 + x);
+            uint8x8_t s2 = vld1_u8(srow2 + x);
+
+            int16x8_t s1x10 = vreinterpretq_s16_u16(vmull_u8(s1, v8c10));
+            int16x8_t s02 = vreinterpretq_s16_u16(vaddl_u8(s2, s0));
+            int16x8_t t1 = vreinterpretq_s16_u16(vsubl_u8(s2, s0));
+            int16x8_t t0 = vmlaq_s16(s1x10, s02, vc3);
+
+            vst1q_s16(trow1 + x, t1);
+            vst1q_s16(trow0 + x, t0);
+#endif
+        }
+        // scalar tail of the vertical pass
+        for( ; x < colsn; x++ )
+        {
+            trow0[x] = (s16)((srow0[x] + srow2[x])*3 + srow1[x]*10);
+            trow1[x] = (s16)(srow2[x] - srow0[x]);
+        }
+
+        // make border
+        // x0/x1 are the columns mirrored into the left/right border (column 1 and column width-2);
+        // for a one-pixel-wide image both fall back to column 0.
+        size_t x0 = (size.width > 1 ? cn : 0), x1 = (size.width > 1 ? (size.width-2)*cn : 0);
+        for( s32 k = 0; k < cn; k++ )
+        {
+            trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
+            trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
+        }
+
+        // do horizontal convolution, interleave the results and store them to dst
+        x = 0;
+        for( ; x < roiw8; x += 8 )
+        {
+            // Same compiler restriction as above, here for 32-bit GCC 4.x older than 4.6.
+#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
+            __asm__ (
+                "vld1.16 {d4-d5}, [%[s2ptr]]                           \n\t"
+                "vld1.16 {d8-d9}, [%[s4ptr]]                           \n\t"
+                "vld1.16 {d6-d7}, [%[s3ptr],:128]                      \n\t"
+                "vld1.16 {d0-d1}, [%[s0ptr]]                           \n\t"
+                "vld1.16 {d2-d3}, [%[s1ptr]]                           \n\t"
+                "vadd.i16 q7, q2, q4                                   \n\t"
+                "vmul.s16 q6, q3, %q[vc10]                             \n\t"
+                "vsub.s16 q5, q1, q0                                   \n\t"
+                "vmla.s16 q6, q7, %q[vc3]                              \n\t"
+                "vst2.16 {d10-d13}, [%[out]]                           \n\t"
+                :
+                : [out] "r" (drow + x * 2),
+                  [s0ptr] "r" (trow0 + x - cn),
+                  [s1ptr] "r" (trow0 + x + cn),
+                  [s2ptr] "r" (trow1 + x - cn),
+                  [s3ptr] "r" (trow1 + x),
+                  [s4ptr] "r" (trow1 + x + cn),
+                  [vc10] "w" (vc10), [vc3] "w" (vc3)
+                // "memory": the block reads and writes through the pointer operands above
+                : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","memory"
+            );
+#else
+            int16x8_t s0 = vld1q_s16(trow0 + x - cn);
+            int16x8_t s1 = vld1q_s16(trow0 + x + cn);
+            int16x8_t s2 = vld1q_s16(trow1 + x - cn);
+            int16x8_t s3 = vld1q_s16(trow1 + x);
+            int16x8_t s4 = vld1q_s16(trow1 + x + cn);
+
+            int16x8_t s3x10 = vmulq_s16(s3, vc10);
+            int16x8_t s24 = vaddq_s16(s2, s4);
+
+            int16x8x2_t vr;
+            vr.val[0] = vsubq_s16(s1, s0);
+            vr.val[1] = vmlaq_s16(s3x10, s24, vc3);
+
+            vst2q_s16(drow + x*2, vr);
+#endif
+        }
+        // scalar tail of the horizontal pass
+        for( ; x < colsn; x++ )
+        {
+            drow[x*2] = (s16)(trow0[x+cn] - trow0[x-cn]);
+            drow[x*2+1] = (s16)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
+        }
+    }
+#else
+    // Without NEON there is no implementation; just silence unused-parameter warnings.
+    (void)size;
+    (void)cn;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/separable_filter.cpp
+++ b/3rdparty/carotene/src/separable_filter.cpp
@ -0,0 +1,109 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include "separable_filter.hpp"
+
+namespace CAROTENE_NS {
+
+// Reports whether SeparableFilter3x3() can process the given request.
+// Requirements: a supported configuration, width >= 9, height >= 1, at least two rows once
+// the top/bottom margins are counted, filter ids dx and dy in [0, 3], and one of the
+// CONSTANT / REFLECT / REFLECT101 / REPLICATE border modes.
+bool isSeparableFilter3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
+{
+    if (!isSupportedConfiguration())
+        return false;
+
+    // image geometry
+    if (size.width < 9 || size.height < 1)
+        return false;
+    if ((size.height + borderMargin.top + borderMargin.bottom) < 2)
+        return false;
+
+    // filter ids
+    if (dx < 0 || dx > 3 || dy < 0 || dy > 3)
+        return false;
+
+    // border handling
+    return border == BORDER_MODE_CONSTANT   ||
+           border == BORDER_MODE_REFLECT    ||
+           border == BORDER_MODE_REFLECT101 ||
+           border == BORDER_MODE_REPLICATE;
+}
+
+// Applies a separable 3x3 filter to an 8-bit image, producing 16-bit signed output.
+//
+// rowFilter / colFilter select the kernel per axis and index the dispatch table below:
+//   0 -> *_121, 1 -> *_m101, 2 -> *_1m21, 3 -> *Generic.
+// xw / yw supply the weights for the generic row / column kernel; a generic kernel
+// requested without its weights aborts the process.
+void SeparableFilter3x3(const Size2D &size,
+                        const u8 * srcBase, ptrdiff_t srcStride,
+                        s16 * dstBase, ptrdiff_t dstStride,
+                        const u8 rowFilter, const u8 colFilter, const s16 *xw, const s16 *yw,
+                        BORDER_MODE border, u8 borderValue, Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isSeparableFilter3x3Supported(size, border, rowFilter, colFilter, borderMargin));
+#ifdef CAROTENE_NEON
+    // A generic kernel (id >= 3) cannot run without user-provided weights.
+    const bool rowWeightsMissing = !xw && rowFilter >= 3;
+    const bool colWeightsMissing = !yw && colFilter >= 3;
+    if (rowWeightsMissing || colWeightsMissing)
+        std::abort();//Couldn't call generic filter without provided weights
+
+    typedef void (*FilterFn)(const Size2D&, const u8*, ptrdiff_t, s16*, ptrdiff_t,
+                             const s16*, const s16*, BORDER_MODE, u8, Margin);
+
+    // Indexed as [colFilter][rowFilter].
+    static FilterFn dispatch[4][4]=
+    {
+    /*d0y*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16_121>::process,
+             /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16_121>::process,
+             /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16_121>::process,
+             /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_121>::process},
+
+    /*dy */{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16_m101>::process,
+             /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16_m101>::process,
+             /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16_m101>::process,
+             /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_m101>::process},
+
+    /*d2y*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16_1m21>::process,
+             /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16_1m21>::process,
+             /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16_1m21>::process,
+             /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_1m21>::process},
+
+    /*dNy*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16Generic>::process,
+             /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16Generic>::process,
+             /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16Generic>::process,
+             /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16Generic>::process}
+    };
+
+    const FilterFn filter = dispatch[colFilter][rowFilter];
+    filter(size, srcBase, srcStride, dstBase, dstStride,
+           xw, yw, border, borderValue, borderMargin);
+#else
+    // Without NEON there is no implementation; just silence unused-parameter warnings.
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)xw;
+    (void)yw;
+    (void)borderValue;
+#endif
+}
+
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/separable_filter.hpp
+++ b/3rdparty/carotene/src/separable_filter.hpp
--- a/3rdparty/carotene/src/sobel.cpp
+++ b/3rdparty/carotene/src/sobel.cpp
@ -0,0 +1,317 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include <vector>
+
+#include "common.hpp"
+
+namespace CAROTENE_NS {
+
+// Reports whether the 8-bit Sobel3x3() can handle the request: dx and dy must each be
+// 0, 1 or 2, at least one of them non-zero, and the matching separable 3x3 filter
+// configuration must be supported.
+bool isSobel3x3Supported(const Size2D &size, BORDER_MODE border,
+                         s32 dx, s32 dy, Margin borderMargin)
+{
+    const bool dxInRange = dx >= 0 && dx < 3;
+    const bool dyInRange = dy >= 0 && dy < 3;
+    if (!dxInRange || !dyInRange)
+        return false;
+
+    // dx == dy == 0 would be plain smoothing, not a derivative
+    if ((dx + dy) <= 0)
+        return false;
+
+    return isSeparableFilter3x3Supported(size, border, dx, dy, borderMargin);
+}
+
+// 3x3 Sobel derivative of an 8-bit image into a 16-bit signed destination.
+// dx / dy are used directly as the row / column filter ids of SeparableFilter3x3();
+// no custom weights are passed.
+void Sobel3x3(const Size2D &size,
+              const u8 * srcBase, ptrdiff_t srcStride,
+              s16 * dstBase, ptrdiff_t dstStride,
+              s32 dx, s32 dy,
+              BORDER_MODE borderType, u8 borderValue, Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isSobel3x3Supported(size, borderType, dx, dy, borderMargin));
+#ifdef CAROTENE_NEON
+    const u8 rowFilter = (u8)dx;
+    const u8 colFilter = (u8)dy;
+    const s16 * const noWeights = 0;
+
+    SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
+                       rowFilter, colFilter, noWeights, noWeights,
+                       borderType, borderValue, borderMargin);
+#else
+    // Without NEON there is no implementation; just silence unused-parameter warnings.
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)borderValue;
+#endif
+}
+
+// Reports whether the f32 Sobel3x3() can handle the request: a supported configuration,
+// dx and dy each 0, 1 or 2 with at least one non-zero, an image of at least 4x2 pixels,
+// and one of the CONSTANT / REFLECT / REFLECT101 / REPLICATE border modes.
+bool isSobel3x3f32Supported(const Size2D &size, BORDER_MODE border,
+                            s32 dx, s32 dy)
+{
+    if (!isSupportedConfiguration())
+        return false;
+
+    // derivative orders
+    if (dx < 0 || dx > 2 || dy < 0 || dy > 2)
+        return false;
+    if ((dx + dy) <= 0)
+        return false;
+
+    // minimal image size
+    if (size.width < 4 || size.height < 2)
+        return false;
+
+    // border handling
+    return border == BORDER_MODE_CONSTANT   ||
+           border == BORDER_MODE_REFLECT    ||
+           border == BORDER_MODE_REFLECT101 ||
+           border == BORDER_MODE_REPLICATE;
+}
+
+// 3x3 Sobel derivative of a 32-bit float image.
+//
+//   dx, dy       derivative order per axis (0 = smoothing [1 2 1], 1 = [-1 0 1], 2 = [1 -2 1])
+//   borderType   CONSTANT, REFLECT, REFLECT101 or REPLICATE (see isSobel3x3f32Supported)
+//   borderValue  value of the pixels outside the image for BORDER_MODE_CONSTANT
+//
+// Every row is first filtered vertically and then horizontally. The result of row y goes to
+// one of two temporary rows (chosen by the parity of y) and is copied to the destination only
+// while row y+1 is processed; the last row is flushed at the end.
+// NOTE(review): the one-row delay presumably allows srcBase == dstBase - confirm with callers.
+void Sobel3x3(const Size2D &size,
+              const f32 * srcBase, ptrdiff_t srcStride,
+              f32 * dstBase, ptrdiff_t dstStride,
+              s32 dx, s32 dy,
+              BORDER_MODE borderType, f32 borderValue)
+{
+    internal::assertSupportedConfiguration(isSobel3x3f32Supported(size, borderType, dx, dy));
+#ifdef CAROTENE_NEON
+    std::vector<f32> _tmp;
+    f32 *tmp = 0;
+    if (borderType == BORDER_MODE_CONSTANT)
+    {
+        // Row of border pixels standing in for the rows above/below the image.
+        // The vector loop below loads tmp[x .. x+3] with x as large as size.width, so three
+        // extra elements are appended to keep those loads inside the buffer.
+        _tmp.assign(size.width + 2 + 3, borderValue);
+        tmp = &_tmp[1];
+    }
+
+    ptrdiff_t delta = (ptrdiff_t)((size.width + 2 + 31) & -32);//align size
+    std::vector<f32> _tempBuf((delta << 1) + 64);
+    f32 *trow0 = internal::alignPtr(&_tempBuf[1], 32), *trow1 = internal::alignPtr(trow0 + delta, 32);
+
+    for( size_t y = 0; y < size.height; y++ )
+    {
+        const f32* srow0;
+        const f32* srow1 = internal::getRowPtr(srcBase, srcStride, y);
+        const f32* srow2;
+        // drow receives the delayed result of the previous row, drow1 the current (last) row.
+        f32* drow = internal::getRowPtr(dstBase, dstStride, y > 0 ? y-1 : 0);
+        f32* drow1 = internal::getRowPtr(dstBase, dstStride, y);
+        if (borderType == BORDER_MODE_REFLECT101) {
+            srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
+            srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
+        } else  if (borderType == BORDER_MODE_CONSTANT) {
+            srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
+            srow2 =  y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
+        } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
+            srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
+            srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
+        }
+
+        // Sliding window of three vertically filtered vectors (previous, current, next).
+        float32x4_t tprev = vmovq_n_f32(0.f);
+        float32x4_t tcurr = vmovq_n_f32(0.f);
+        float32x4_t tnext = vmovq_n_f32(0.f);
+        float32x4_t t0, t1, t2;
+        // do vertical convolution
+        // On the last two rows the loop stops one vector earlier so that the 4-wide loads
+        // never run past the end of the last source row.
+        size_t x = 0, bcolsn = y + 2 < size.height ? size.width : (size.width - 4);
+        for( ; x <= bcolsn; x += 4 )
+        {
+            internal::prefetch(srow0 + x);
+            internal::prefetch(srow1 + x);
+            internal::prefetch(srow2 + x);
+
+            float32x4_t x0 = vld1q_f32(srow0 + x);
+            float32x4_t x1 = vld1q_f32(srow1 + x);
+            float32x4_t x2 = vld1q_f32(srow2 + x);
+
+            tprev = tcurr;
+            tcurr = tnext;
+            if(!dy)
+            {
+                tnext = vaddq_f32(vaddq_f32(vaddq_f32(x1, x1), x2), x0);
+            }
+            else if(dy == 2)
+            {
+                tnext = vsubq_f32(vsubq_f32(x2, x1), vsubq_f32(x1, x0));
+            }
+            else
+            {
+                tnext = vsubq_f32(x2, x0);
+            }
+
+            if(!x) {
+                tcurr = tnext;
+                // make border
+                // Lane 3 of this vector acts as the column left of the image in the next iteration.
+                if (borderType == BORDER_MODE_CONSTANT)
+                {
+                    // Vertically filtered constant column: 4*borderValue for the smoothing kernel,
+                    // 0 for both difference kernels (same values as the scalar path below).
+                    tcurr = vsetq_lane_f32(!dy ? 4*borderValue : 0.f,tcurr, 3);
+                }
+                else if (borderType == BORDER_MODE_REFLECT101)
+                {
+                    tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 1),tcurr, 3);
+                }
+                else // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
+                {
+                    tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 0),tcurr, 3);
+                }
+                continue;
+            }
+
+            internal::prefetch(trow0 + x);
+            internal::prefetch(trow1 + x);
+
+            // left neighbours, centre and right neighbours of columns x-4 .. x-1
+            t0 = vextq_f32(tprev, tcurr, 3);
+            t1 = tcurr;
+            t2 = vextq_f32(tcurr, tnext, 1);
+            if(!dx)
+            {
+                t0 = vaddq_f32(t0, vaddq_f32(vaddq_f32(t1, t1), t2));
+            }
+            else if(dx == 2)
+            {
+                t0 = vsubq_f32(vsubq_f32(t2, t1), vsubq_f32(t1, t0));
+            }
+            else
+            {
+                t0 = vsubq_f32(t2, t0);
+            }
+
+            if(!(y%2))
+            {
+                vst1q_f32(trow0 + x - 4, t0);
+            }
+            else
+            {
+                vst1q_f32(trow1 + x - 4, t0);
+            }
+        }
+        // Step back to the first column the vector loop did not finish; when that would be
+        // past the row, redo the last column so the right border is handled by scalar code.
+        x -= 4;
+        if(x == size.width){
+            x--;
+        }
+        // Vertically filtered values of the columns left of, at and right of x.
+        f32 prevx = 0, rowx = 0, nextx = 0;
+        if(!dy)
+        {
+            prevx = x > 0 ? srow2[x-1] + 2*srow1[x-1] + srow0[x-1] :
+                    (borderType == BORDER_MODE_REFLECT101 ? srow2[1] + 2*srow1[1] + srow0[1] :
+                    (borderType == BORDER_MODE_CONSTANT   ? 4*borderValue :
+                                                            srow2[0] + 2*srow1[0] + srow0[0]) );
+            rowx  = srow2[x] + 2*srow1[x] + srow0[x];
+        }
+        else if(dy == 2)
+        {
+            prevx = x > 0 ? srow2[x-1] - 2*srow1[x-1] + srow0[x-1] :
+                    (borderType == BORDER_MODE_REFLECT101 ? srow2[1] - 2*srow1[1] + srow0[1] :
+                    (borderType == BORDER_MODE_CONSTANT   ? 0.f :
+                                                            srow2[0] - 2*srow1[0] + srow0[0]) );
+            rowx  = srow2[x] - 2*srow1[x] + srow0[x];
+        }
+        else
+        {
+            prevx = x > 0 ? srow2[x-1] - srow0[x-1] :
+                    (borderType == BORDER_MODE_REFLECT101 ? srow2[1] - srow0[1] :
+                    (borderType == BORDER_MODE_CONSTANT   ? 0.f :
+                                                            srow2[0] - srow0[0]) );
+            rowx  = srow2[x] - srow0[x];
+        }
+
+        // scalar tail, including the right border column
+        for( ; x < size.width; x++ )
+        {
+            if(x+1 == size.width) {
+                // make border
+                if (borderType == BORDER_MODE_CONSTANT)
+                {
+                    if(!dy) {
+                        nextx = 4*borderValue;
+                    } else {
+                        nextx = 0.f;
+                    }
+                } else if (borderType == BORDER_MODE_REFLECT101)
+                {
+                    if(!dy) {
+                        nextx = srow2[x-1] + 2*srow1[x-1] + srow0[x-1];
+                    } else if(dy == 2) {
+                        nextx = srow2[x-1] - 2*srow1[x-1] + srow0[x-1];
+                    } else {
+                        nextx = srow2[x-1] - srow0[x-1];
+                    }
+                } else {
+                    if(!dy) {
+                        nextx = srow2[x] + 2*srow1[x] + srow0[x];
+                    } else if(dy == 2) {
+                        nextx = srow2[x] - 2*srow1[x] + srow0[x];
+                    } else {
+                        nextx = srow2[x] - srow0[x];
+                    }
+                }
+            } else {
+                if(!dy) {
+                    nextx = srow2[x+1] + 2*srow1[x+1] + srow0[x+1];
+                } else if(dy == 2) {
+                    nextx = srow2[x+1] - 2*srow1[x+1] + srow0[x+1];
+                } else {
+                    nextx = srow2[x+1] - srow0[x+1];
+                }
+            }
+            f32 res;
+            if(dx==1) {
+                res = nextx - prevx;
+            } else if(!dx) {
+                res = prevx + 2*rowx + nextx;
+            } else {
+                res = prevx - 2*rowx + nextx;
+            }
+            if(!(y%2)) {
+                *(trow0+x) = res;
+            } else {
+                *(trow1+x) = res;
+            }
+            prevx = rowx;
+            rowx = nextx;
+        }
+
+        // Write out the previous row's result, held in the buffer of the opposite parity.
+        if(y>0) {
+            for(size_t x1 = 0; x1 < size.width; x1++ )
+            {
+                if(y%2)
+                    *(drow + x1) = trow0[x1];
+                else
+                    *(drow + x1) = trow1[x1];
+            }
+        }
+        // The last row has no successor to trigger its write-out, so flush it here.
+        if(y == size.height-1) {
+            for(size_t x1 = 0; x1 < size.width; x1++ )
+            {
+                if(!(y%2))
+                    *(drow1 + x1) = trow0[x1];
+                else
+                    *(drow1 + x1) = trow1[x1];
+            }
+        }
+    }
+#else
+    // Without NEON there is no implementation; just silence unused-parameter warnings.
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)borderValue;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/sub.cpp
+++ b/3rdparty/carotene/src/sub.cpp
@ -0,0 +1,621 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+// Subtraction functor without saturation.
+//
+// The 128-bit and 64-bit overloads forward to internal::vsubq / internal::vsub
+// on the register types chosen by internal::VecTraits<T>. The scalar overload
+// computes the difference in the wider type WT and converts it back to T with
+// a plain cast.
+template <typename T, typename WT>
+struct SubWrap
+{
+    typedef T type;
+
+    // 128-bit register overload.
+    void operator() (const typename internal::VecTraits<T>::vec128 & lhs,
+                     const typename internal::VecTraits<T>::vec128 & rhs,
+                     typename internal::VecTraits<T>::vec128 & result) const
+    {
+        result = internal::vsubq(lhs, rhs);
+    }
+
+    // 64-bit register overload.
+    void operator() (const typename internal::VecTraits<T>::vec64 & lhs,
+                     const typename internal::VecTraits<T>::vec64 & rhs,
+                     typename internal::VecTraits<T>::vec64 & result) const
+    {
+        result = internal::vsub(lhs, rhs);
+    }
+
+    // Single-element overload: only element 0 of each pointer is touched.
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        const WT minuend = (WT)src0[0];
+        const WT subtrahend = (WT)src1[0];
+        dst[0] = (T)(minuend - subtrahend);
+    }
+};
+
+// Subtraction functor with saturation.
+//
+// The 128-bit and 64-bit overloads forward to internal::vqsubq / internal::vqsub
+// on the register types chosen by internal::VecTraits<T>. The scalar overload
+// computes the difference in the wider type WT and narrows it to T through
+// internal::saturate_cast<T>.
+template <typename T, typename WT>
+struct SubSaturate
+{
+    typedef T type;
+
+    // 128-bit register overload.
+    void operator() (const typename internal::VecTraits<T>::vec128 & lhs,
+                     const typename internal::VecTraits<T>::vec128 & rhs,
+                     typename internal::VecTraits<T>::vec128 & result) const
+    {
+        result = internal::vqsubq(lhs, rhs);
+    }
+
+    // 64-bit register overload.
+    void operator() (const typename internal::VecTraits<T>::vec64 & lhs,
+                     const typename internal::VecTraits<T>::vec64 & rhs,
+                     typename internal::VecTraits<T>::vec64 & result) const
+    {
+        result = internal::vqsub(lhs, rhs);
+    }
+
+    // Single-element overload: only element 0 of each pointer is touched.
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        const WT minuend = (WT)src0[0];
+        const WT subtrahend = (WT)src1[0];
+        dst[0] = internal::saturate_cast<T>(minuend - subtrahend);
+    }
+};
+
+} // namespace
+
+#endif
+
+// Per-element subtraction dst = src0 - src1 for u8 images.
+//
+// CONVERT_POLICY_SATURATE dispatches to the SubSaturate functor; every other
+// policy value dispatches to SubWrap. The work itself is done by
+// internal::vtransform. When CAROTENE_NEON is not defined the body only marks
+// its arguments as used.
+void sub(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         u8 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    const bool saturating = (policy == CONVERT_POLICY_SATURATE);
+    if (!saturating)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<u8, s16>());
+        return;
+    }
+
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         SubSaturate<u8, s16>());
+#else
+    (void)size;
+    (void)src0Base; (void)src0Stride;
+    (void)src1Base; (void)src1Stride;
+    (void)dstBase;  (void)dstStride;
+    (void)policy;
+#endif
+}
+
+// Per-element subtraction of two u8 images into an s16 image.
+//
+// The conversion policy argument is unnamed and unused: the difference of two
+// u8 values always fits into s16. Rows are processed 32 pixels at a time, then
+// 8 at a time, then one by one. When CAROTENE_NEON is not defined the body
+// only marks its arguments as used.
+void sub(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Exclusive upper bounds for loop counters that still leave room for a
+    // full 32-pixel / 8-pixel step inside the row.
+    const size_t vecEnd32 = size.width >= 31 ? size.width - 31 : 0;
+    const size_t vecEnd8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const u8 * lhs = internal::getRowPtr(src0Base, src0Stride, row);
+        const u8 * rhs = internal::getRowPtr(src1Base, src1Stride, row);
+        s16 * out = internal::getRowPtr(dstBase, dstStride, row);
+        size_t col = 0;
+
+        while (col < vecEnd32)
+        {
+            internal::prefetch(lhs + col);
+            internal::prefetch(rhs + col);
+            const uint8x16_t lhsA = vld1q_u8(lhs + col);
+            const uint8x16_t lhsB = vld1q_u8(lhs + col + 16);
+            const uint8x16_t rhsA = vld1q_u8(rhs + col);
+            const uint8x16_t rhsB = vld1q_u8(rhs + col + 16);
+
+            // vsubl_u8 yields the 16-bit difference as unsigned lanes; the
+            // same bit pattern is stored as s16.
+            vst1q_s16(out + col,
+                      vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(lhsA), vget_low_u8(rhsA))));
+            vst1q_s16(out + col + 8,
+                      vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(lhsA), vget_high_u8(rhsA))));
+            vst1q_s16(out + col + 16,
+                      vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(lhsB), vget_low_u8(rhsB))));
+            vst1q_s16(out + col + 24,
+                      vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(lhsB), vget_high_u8(rhsB))));
+
+            col += 32;
+        }
+
+        while (col < vecEnd8)
+        {
+            const uint8x8_t lhs8 = vld1_u8(lhs + col);
+            const uint8x8_t rhs8 = vld1_u8(rhs + col);
+            vst1q_s16(out + col, vreinterpretq_s16_u16(vsubl_u8(lhs8, rhs8)));
+            col += 8;
+        }
+
+        // Scalar tail for the remaining pixels of the row.
+        while (col < size.width)
+        {
+            out[col] = (s16)lhs[col] - (s16)rhs[col];
+            ++col;
+        }
+    }
+#else
+    (void)size;
+    (void)src0Base; (void)src0Stride;
+    (void)src1Base; (void)src1Stride;
+    (void)dstBase;  (void)dstStride;
+#endif
+}
+
+// Per-element subtraction of two u8 images into an f32 image.
+//
+// Vector path: u8 lanes are subtracted with widening to 16 bit, the result is
+// reinterpreted as signed, widened to s32 and converted to f32. Rows are
+// processed 32 pixels at a time, then 8 at a time, then one by one. When
+// CAROTENE_NEON is not defined the body only marks its arguments as used.
+void sub(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         f32 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Exclusive upper bounds for loop counters that still leave room for a
+    // full 32-pixel / 8-pixel step inside the row.
+    const size_t vecEnd32 = size.width >= 31 ? size.width - 31 : 0;
+    const size_t vecEnd8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const u8 * lhs = internal::getRowPtr(src0Base, src0Stride, row);
+        const u8 * rhs = internal::getRowPtr(src1Base, src1Stride, row);
+        f32 * out = internal::getRowPtr(dstBase, dstStride, row);
+        size_t col = 0;
+
+        while (col < vecEnd32)
+        {
+            internal::prefetch(lhs + col);
+            internal::prefetch(rhs + col);
+            const uint8x16_t lhsA = vld1q_u8(lhs + col);
+            const uint8x16_t lhsB = vld1q_u8(lhs + col + 16);
+            const uint8x16_t rhsA = vld1q_u8(rhs + col);
+            const uint8x16_t rhsB = vld1q_u8(rhs + col + 16);
+
+            // Pixels [col, col + 16).
+            int16x8_t diffLo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(lhsA), vget_low_u8(rhsA)));
+            int16x8_t diffHi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(lhsA), vget_high_u8(rhsA)));
+
+            vst1q_f32(out + col +  0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(diffLo))));
+            vst1q_f32(out + col +  4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(diffLo))));
+            vst1q_f32(out + col +  8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(diffHi))));
+            vst1q_f32(out + col + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(diffHi))));
+
+            // Pixels [col + 16, col + 32).
+            diffLo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(lhsB), vget_low_u8(rhsB)));
+            diffHi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(lhsB), vget_high_u8(rhsB)));
+
+            vst1q_f32(out + col + 16, vcvtq_f32_s32(vmovl_s16(vget_low_s16(diffLo))));
+            vst1q_f32(out + col + 20, vcvtq_f32_s32(vmovl_s16(vget_high_s16(diffLo))));
+            vst1q_f32(out + col + 24, vcvtq_f32_s32(vmovl_s16(vget_low_s16(diffHi))));
+            vst1q_f32(out + col + 28, vcvtq_f32_s32(vmovl_s16(vget_high_s16(diffHi))));
+
+            col += 32;
+        }
+
+        while (col < vecEnd8)
+        {
+            const uint8x8_t lhs8 = vld1_u8(lhs + col);
+            const uint8x8_t rhs8 = vld1_u8(rhs + col);
+
+            const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(lhs8, rhs8));
+            vst1q_f32(out + col + 0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(diff))));
+            vst1q_f32(out + col + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(diff))));
+
+            col += 8;
+        }
+
+        // Scalar tail for the remaining pixels of the row.
+        while (col < size.width)
+        {
+            out[col] = (f32)lhs[col] - (f32)rhs[col];
+            ++col;
+        }
+    }
+#else
+    (void)size;
+    (void)src0Base; (void)src0Stride;
+    (void)src1Base; (void)src1Stride;
+    (void)dstBase;  (void)dstStride;
+#endif
+}
+
+// Per-element subtraction dst = src0 - src1 where src0 is u8, src1 is s16 and
+// the destination is s16.
+//
+// With CONVERT_POLICY_SATURATE the vector path uses vqsubq_s16 and the scalar
+// tail uses internal::saturate_cast<s16>; with any other policy the vector
+// path uses vsubq_s16 and the scalar tail uses a plain cast to s16.
+// Each row is processed 16 pixels at a time, then 8 at a time, then one by one.
+// When CAROTENE_NEON is not defined the body only marks its arguments as used.
+void sub(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const s16 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Exclusive upper bounds for j that still leave room for a full
+    // 16-pixel / 8-pixel step inside the row.
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
+        const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
+        s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        if (policy == CONVERT_POLICY_SATURATE)
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                // Widen the 16 u8 minuends to two s16 vectors before subtracting.
+                uint8x16_t v_src0 = vld1q_u8(src0 + j);
+                int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
+                int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
+                int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
+                int16x8_t v_dst0 = vqsubq_s16(v_src00, v_src10);
+                int16x8_t v_dst1 = vqsubq_s16(v_src01, v_src11);
+                vst1q_s16(dst + j, v_dst0);
+                vst1q_s16(dst + j + 8, v_dst1);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
+                int16x8_t v_src1 = vld1q_s16(src1 + j);
+                int16x8_t v_dst = vqsubq_s16(v_src0, v_src1);
+                vst1q_s16(dst + j, v_dst);
+            }
+
+            // Scalar tail: subtract in s32, then saturate to s16.
+            for (; j < size.width; j++)
+                dst[j] = internal::saturate_cast<s16>((s32)src0[j] - (s32)src1[j]);
+        }
+        else
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                // Widen the 16 u8 minuends to two s16 vectors before subtracting.
+                uint8x16_t v_src0 = vld1q_u8(src0 + j);
+                int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
+                int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
+                int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
+                int16x8_t v_dst0 = vsubq_s16(v_src00, v_src10);
+                int16x8_t v_dst1 = vsubq_s16(v_src01, v_src11);
+                vst1q_s16(dst + j, v_dst0);
+                vst1q_s16(dst + j + 8, v_dst1);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
+                int16x8_t v_src1 = vld1q_s16(src1 + j);
+                int16x8_t v_dst = vsubq_s16(v_src0, v_src1);
+                vst1q_s16(dst + j, v_dst);
+            }
+
+            // Scalar tail: subtract in s32, then cast to s16 without saturation.
+            for (; j < size.width; j++)
+                dst[j] = (s16)((s32)src0[j] - (s32)src1[j]);
+        }
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+// Per-element subtraction dst = src0 - src1 where src0 is s16, src1 is u8 and
+// the destination is s16.
+//
+// With CONVERT_POLICY_SATURATE the vector path uses vqsubq_s16 and the scalar
+// tail uses internal::saturate_cast<s16>; with any other policy the vector
+// path uses vsubq_s16 and the scalar tail uses a plain cast to s16.
+// Each row is processed 16 pixels at a time, then 8 at a time, then one by one.
+// When CAROTENE_NEON is not defined the body only marks its arguments as used.
+void sub(const Size2D &size,
+         const s16 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    // Exclusive upper bounds for j that still leave room for a full
+    // 16-pixel / 8-pixel step inside the row.
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
+        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
+        s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        if (policy == CONVERT_POLICY_SATURATE)
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
+                // Widen the 16 u8 subtrahends to two s16 vectors before subtracting.
+                uint8x16_t v_src1 = vld1q_u8(src1 + j);
+                int16x8_t v_src10 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src1)));
+                int16x8_t v_src11 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src1)));
+                int16x8_t v_dst0 = vqsubq_s16(v_src00, v_src10);
+                int16x8_t v_dst1 = vqsubq_s16(v_src01, v_src11);
+                vst1q_s16(dst + j, v_dst0);
+                vst1q_s16(dst + j + 8, v_dst1);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src0 = vld1q_s16(src0 + j);
+                int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src1 + j)));
+                int16x8_t v_dst = vqsubq_s16(v_src0, v_src1);
+                vst1q_s16(dst + j, v_dst);
+            }
+
+            // Scalar tail: subtract in s32, then saturate to s16.
+            for (; j < size.width; j++)
+                dst[j] = internal::saturate_cast<s16>((s32)src0[j] - (s32)src1[j]);
+        }
+        else
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
+                // Widen the 16 u8 subtrahends to two s16 vectors before subtracting.
+                uint8x16_t v_src1 = vld1q_u8(src1 + j);
+                int16x8_t v_src10 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src1)));
+                int16x8_t v_src11 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src1)));
+                int16x8_t v_dst0 = vsubq_s16(v_src00, v_src10);
+                int16x8_t v_dst1 = vsubq_s16(v_src01, v_src11);
+                vst1q_s16(dst + j, v_dst0);
+                vst1q_s16(dst + j + 8, v_dst1);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src0 = vld1q_s16(src0 + j);
+                int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src1 + j)));
+                int16x8_t v_dst = vsubq_s16(v_src0, v_src1);
+                vst1q_s16(dst + j, v_dst);
+            }
+
+            // Scalar tail: subtract in s32, then cast to s16 without saturation.
+            for (; j < size.width; j++)
+                dst[j] = (s16)((s32)src0[j] - (s32)src1[j]);
+        }
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+// Per-element subtraction dst = src0 - src1 for s8 images.
+//
+// CONVERT_POLICY_SATURATE dispatches to the SubSaturate functor; every other
+// policy value dispatches to SubWrap. The work itself is done by
+// internal::vtransform. When CAROTENE_NEON is not defined the body only marks
+// its arguments as used.
+void sub(const Size2D &size,
+         const s8 * src0Base, ptrdiff_t src0Stride,
+         const s8 * src1Base, ptrdiff_t src1Stride,
+         s8 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    const bool saturating = (policy == CONVERT_POLICY_SATURATE);
+    if (!saturating)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<s8, s16>());
+        return;
+    }
+
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         SubSaturate<s8, s16>());
+#else
+    (void)size;
+    (void)src0Base; (void)src0Stride;
+    (void)src1Base; (void)src1Stride;
+    (void)dstBase;  (void)dstStride;
+    (void)policy;
+#endif
+}
+
+// Per-element subtraction dst = src0 - src1 for s16 images.
+//
+// CONVERT_POLICY_SATURATE dispatches to the SubSaturate functor; every other
+// policy value dispatches to SubWrap. The work itself is done by
+// internal::vtransform. When CAROTENE_NEON is not defined the body only marks
+// its arguments as used.
+void sub(const Size2D &size,
+         const s16 * src0Base, ptrdiff_t src0Stride,
+         const s16 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    const bool saturating = (policy == CONVERT_POLICY_SATURATE);
+    if (!saturating)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<s16, s32>());
+        return;
+    }
+
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         SubSaturate<s16, s32>());
+#else
+    (void)size;
+    (void)src0Base; (void)src0Stride;
+    (void)src1Base; (void)src1Stride;
+    (void)dstBase;  (void)dstStride;
+    (void)policy;
+#endif
+}
+
+// Per-element subtraction dst = src0 - src1 for u16 images.
+//
+// CONVERT_POLICY_SATURATE dispatches to the SubSaturate functor; every other
+// policy value dispatches to SubWrap. The work itself is done by
+// internal::vtransform. When CAROTENE_NEON is not defined the body only marks
+// its arguments as used.
+void sub(const Size2D &size,
+         const u16 * src0Base, ptrdiff_t src0Stride,
+         const u16 * src1Base, ptrdiff_t src1Stride,
+         u16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    const bool saturating = (policy == CONVERT_POLICY_SATURATE);
+    if (!saturating)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<u16, s32>());
+        return;
+    }
+
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         SubSaturate<u16, s32>());
+#else
+    (void)size;
+    (void)src0Base; (void)src0Stride;
+    (void)src1Base; (void)src1Stride;
+    (void)dstBase;  (void)dstStride;
+    (void)policy;
+#endif
+}
+
+// Per-element subtraction dst = src0 - src1 for s32 images.
+//
+// CONVERT_POLICY_SATURATE dispatches to the SubSaturate functor; every other
+// policy value dispatches to SubWrap. The work itself is done by
+// internal::vtransform. When CAROTENE_NEON is not defined the body only marks
+// its arguments as used.
+void sub(const Size2D &size,
+         const s32 * src0Base, ptrdiff_t src0Stride,
+         const s32 * src1Base, ptrdiff_t src1Stride,
+         s32 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    const bool saturating = (policy == CONVERT_POLICY_SATURATE);
+    if (!saturating)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<s32, s64>());
+        return;
+    }
+
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         SubSaturate<s32, s64>());
+#else
+    (void)size;
+    (void)src0Base; (void)src0Stride;
+    (void)src1Base; (void)src1Stride;
+    (void)dstBase;  (void)dstStride;
+    (void)policy;
+#endif
+}
+
+// Per-element subtraction dst = src0 - src1 for u32 images.
+//
+// CONVERT_POLICY_SATURATE dispatches to the SubSaturate functor; every other
+// policy value dispatches to SubWrap. The work itself is done by
+// internal::vtransform. When CAROTENE_NEON is not defined the body only marks
+// its arguments as used.
+void sub(const Size2D &size,
+         const u32 * src0Base, ptrdiff_t src0Stride,
+         const u32 * src1Base, ptrdiff_t src1Stride,
+         u32 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    const bool saturating = (policy == CONVERT_POLICY_SATURATE);
+    if (!saturating)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<u32, s64>());
+        return;
+    }
+
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         SubSaturate<u32, s64>());
+#else
+    (void)size;
+    (void)src0Base; (void)src0Stride;
+    (void)src1Base; (void)src1Stride;
+    (void)dstBase;  (void)dstStride;
+    (void)policy;
+#endif
+}
+
+// Per-element subtraction dst = src0 - src1 for f32 images.
+//
+// There is no conversion policy for this overload; the SubWrap functor is
+// always used, driven by internal::vtransform. When CAROTENE_NEON is not
+// defined the body only marks its arguments as used.
+void sub(const Size2D &size,
+         const f32 * src0Base, ptrdiff_t src0Stride,
+         const f32 * src1Base, ptrdiff_t src1Stride,
+         f32 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifndef CAROTENE_NEON
+    (void)size;
+    (void)src0Base; (void)src0Stride;
+    (void)src1Base; (void)src1Stride;
+    (void)dstBase;  (void)dstStride;
+#else
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         SubWrap<f32, f32>());
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/sum.cpp
+++ b/3rdparty/carotene/src/sum.cpp
@ -0,0 +1,385 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+// Reports whether sum() accepts the given channel count: 1 to 4 inclusive.
+bool isSumSupported(u32 channels)
+{
+    return channels >= 1 && channels <= 4;
+}
+
+// Adds the per-channel sums of a u8 image to sumdst[0 .. channels-1].
+//
+// The results are accumulated on top of whatever sumdst already holds.
+// channels must satisfy isSumSupported(); this is passed to
+// internal::assertSupportedConfiguration. When CAROTENE_NEON is not defined
+// the body only marks its arguments as used.
+//
+// Accumulation scheme: u8 values are summed into u16 lanes in runs of at most
+// 257 values per lane (257 * 255 == 65535, the largest run that cannot
+// overflow u16), and each run is then folded into u32 lanes.
+void sum(const Size2D &_size,
+         const u8 * srcBase, ptrdiff_t srcStride,
+         u32 * sumdst, u32 channels)
+{
+    internal::assertSupportedConfiguration(isSumSupported(channels));
+#ifdef CAROTENE_NEON
+    Size2D size(_size);
+    // A row of u8 data occupies size.width * channels elements, so the image
+    // is densely packed when the stride equals that amount; it can then be
+    // treated as one long row. (The channel count has to be part of the
+    // comparison, otherwise multi-channel images never take this path.)
+    if (srcStride == (ptrdiff_t)(size.width * channels))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+    const ptrdiff_t width = size.width * channels;
+
+    for(size_t k = 0; k < size.height; ++k)
+    {
+        const u8* src = internal::getRowPtr( srcBase,  srcStride, k);
+        ptrdiff_t i = 0;
+
+        if (channels == 3)
+        {
+            // 24 consecutive elements hold 8 interleaved 3-channel pixels.
+            // The three accumulators are named after the channel order of
+            // their four lanes (1-based): {1,2,3,1}, {3,1,2,3}, {2,3,1,2}.
+            uint32x4_t vs1231 = vdupq_n_u32(0);
+            uint32x4_t vs3123 = vdupq_n_u32(0);
+            uint32x4_t vs2312 = vdupq_n_u32(0);
+            // Full runs: 257 groups of 24 elements each.
+            for (; i <= width - 257*8*3; i += 257*8*3, src += 257*8*3)
+            {
+                uint16x8_t s1 = vmovl_u8(vld1_u8(src +  0));
+                uint16x8_t s2 = vmovl_u8(vld1_u8(src +  8));
+                uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16));
+
+                for (ptrdiff_t j = 8*3; j < 257*8*3; j+= 8*3)
+                {
+                    internal::prefetch(src + j + 24);
+                    s1 = vaddw_u8(s1, vld1_u8(src + j +  0));
+                    s2 = vaddw_u8(s2, vld1_u8(src + j +  8));
+                    s3 = vaddw_u8(s3, vld1_u8(src + j + 16));
+                }
+
+                vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2)));
+                vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3)));
+                vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1)));
+            }
+            // Final partial run: fewer than 257 groups remain here, so the
+            // u16 lanes still cannot overflow.
+            if (i <= width - 8*3)
+            {
+                uint16x8_t s1 = vmovl_u8(vld1_u8(src +  0));
+                uint16x8_t s2 = vmovl_u8(vld1_u8(src +  8));
+                uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16));
+
+                for (i += 8*3, src += 8*3; i <= width - 8*3; i += 8*3, src += 8*3)
+                {
+                    internal::prefetch(src + 24);
+                    s1 = vaddw_u8(s1, vld1_u8(src +  0));
+                    s2 = vaddw_u8(s2, vld1_u8(src +  8));
+                    s3 = vaddw_u8(s3, vld1_u8(src + 16));
+                }
+
+                vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2)));
+                vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3)));
+                vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1)));
+            }
+
+            // Stored in the order 1231 | 2312 | 3123, so that sum[n] belongs
+            // to channel n % 3.
+            u32 sum[12];
+            vst1q_u32(sum+0, vs1231);
+            vst1q_u32(sum+4, vs2312);
+            vst1q_u32(sum+8, vs3123);
+
+            // Scalar tail: whole pixels that did not fill a 24-element group.
+            for (; i < width; i += 3, src += 3)
+            {
+                sumdst[0] += src[0];
+                sumdst[1] += src[1];
+                sumdst[2] += src[2];
+            }
+
+            sumdst[0] += sum[0] + sum[3] + sum[6] + sum[9];
+            sumdst[1] += sum[1] + sum[4] + sum[7] + sum[10];
+            sumdst[2] += sum[2] + sum[5] + sum[8] + sum[11];
+        }
+        else
+        {
+            // channels is 1, 2 or 4: an 8-element group always starts on a
+            // pixel boundary, so lane n of vs belongs to channel n % channels.
+            uint32x4_t vs = vdupq_n_u32(0);
+            // Full runs: 257 groups of 8 elements each.
+            for (; i <= width - 257*8; i += 257*8, src += 257 * 8)
+            {
+                uint16x8_t s1 = vmovl_u8(vld1_u8(src));
+
+                for (int j = 8; j < 257 * 8; j += 8)
+                {
+                    internal::prefetch(src + j);
+                    s1 = vaddw_u8(s1, vld1_u8(src + j));
+                }
+
+                vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1)));
+            }
+            // Final partial run of fewer than 257 groups.
+            if (i < width - 7)
+            {
+                uint16x8_t s1 = vmovl_u8(vld1_u8(src));
+
+                for(i+=8,src+=8; i < width-7; i+=8,src+=8)
+                {
+                    internal::prefetch(src);
+                    s1 = vaddw_u8(s1, vld1_u8(src));
+                }
+                vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1)));
+            }
+
+            if (channels == 1)
+            {
+                // Reduce all four lanes to a single total.
+                uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
+                uint32x2_t vs1 = vreinterpret_u32_u64(vpaddl_u32(vs2));
+
+                u32 s0 = vget_lane_u32(vs1, 0);
+                for(; i < width; ++i,++src)
+                    s0 += src[0];
+                sumdst[0] += s0;
+            }
+            else if (channels == 4)
+            {
+                // One lane per channel: add the vector straight onto sumdst.
+                vst1q_u32(sumdst, vqaddq_u32(vs, vld1q_u32(sumdst)));
+
+                for(; i < width; i+=4,src+=4)
+                {
+                    sumdst[0] += src[0];
+                    sumdst[1] += src[1];
+                    sumdst[2] += src[2];
+                    sumdst[3] += src[3];
+                }
+            }
+            else//if (channels == 2)
+            {
+                // Lanes alternate between the two channels: fold the high
+                // half onto the low half.
+                uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
+                vst1_u32(sumdst, vqadd_u32(vs2, vld1_u32(sumdst)));
+
+                for(; i < width; i+=2,src+=2)
+                {
+                    sumdst[0] += src[0];
+                    sumdst[1] += src[1];
+                }
+            }
+        }//channels != 3
+    }
+#else
+    (void)_size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)sumdst;
+    (void)channels;
+#endif
+}
+
+// Adds the per-channel sums of an f32 image to sumdst[0 .. channels-1].
+//
+// The results are accumulated on top of whatever sumdst already holds.
+// channels must satisfy isSumSupported(); this is passed to
+// internal::assertSupportedConfiguration. Within a row the vector part is
+// accumulated in f32 lanes and only the per-row totals are added to the f64
+// outputs. When CAROTENE_NEON is not defined the body only marks its
+// arguments as used.
+void sum(const Size2D &_size,
+         const f32 * srcBase, ptrdiff_t srcStride,
+         f64 * sumdst, u32 channels)
+{
+    internal::assertSupportedConfiguration(isSumSupported(channels));
+#ifdef CAROTENE_NEON
+    Size2D size(_size);
+    // Treat the image as one long row when the stride equals size.width.
+    // NOTE(review): the comparison takes neither channels nor sizeof(f32) into
+    // account - confirm the unit of srcStride and whether this branch can
+    // ever be taken for real f32 images. Enabling it would also lengthen the
+    // f32 accumulation and therefore change rounding.
+    if (srcStride == (ptrdiff_t)(size.width))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+    const ptrdiff_t width = size.width * channels;
+
+    for(size_t k = 0; k < size.height; ++k)
+    {
+        const f32* src = internal::getRowPtr( srcBase,  srcStride, k);
+        ptrdiff_t i = 0;
+
+        if (channels == 3)
+        {
+            // 12 consecutive elements hold 4 interleaved 3-channel pixels.
+            // The accumulators are named after the channel order of their
+            // four lanes (1-based).
+            float32x4_t vs1231 = vdupq_n_f32(0);
+            float32x4_t vs2312 = vdupq_n_f32(0);
+            float32x4_t vs3123 = vdupq_n_f32(0);
+            for(; i <= width-12; i += 12)
+            {
+                internal::prefetch(src + i + 12);
+                vs1231 = vaddq_f32(vs1231, vld1q_f32(src + i + 0));
+                vs2312 = vaddq_f32(vs2312, vld1q_f32(src + i + 4));
+                vs3123 = vaddq_f32(vs3123, vld1q_f32(src + i + 8));
+            }
+
+            // Stored consecutively, so that s[n] belongs to channel n % 3.
+            f32 s[12];
+            vst1q_f32(s + 0, vs1231);
+            vst1q_f32(s + 4, vs2312);
+            vst1q_f32(s + 8, vs3123);
+
+            sumdst[0] += s[0] + s[3] + s[6] + s[9];
+            sumdst[1] += s[1] + s[4] + s[7] + s[10];
+            sumdst[2] += s[2] + s[5] + s[8] + s[11];
+            // Scalar tail: whole pixels that did not fill a 12-element group.
+            for( ; i < width; i+=3)
+            {
+                sumdst[0] += src[i];
+                sumdst[1] += src[i+1];
+                sumdst[2] += src[i+2];
+            }
+        }
+        else
+        {
+            // channels is 1, 2 or 4: a 4-element group always starts on a
+            // pixel boundary, so lane n of vs belongs to channel n % channels.
+            float32x4_t vs = vdupq_n_f32(0);
+            for(; i <= width-4; i += 4)
+            {
+                internal::prefetch(src + i);
+                vs = vaddq_f32(vs, vld1q_f32(src+i));
+            }
+
+            if (channels == 1)
+            {
+                // Reduce the four lanes to two, then finish in scalar code.
+                float32x2_t vs2 = vpadd_f32(vget_low_f32(vs), vget_high_f32(vs));
+                f32 s[2];
+                vst1_f32(s, vs2);
+
+                sumdst[0] += s[0] + s[1];
+                for( ; i < width; i++)
+                    sumdst[0] += src[i];
+            }
+            else if (channels == 4)
+            {
+                // One lane per channel; width is a multiple of 4 here, so the
+                // vector loop has consumed the whole row and no tail is needed.
+                f32 s[4];
+                vst1q_f32(s, vs);
+
+                sumdst[0] += s[0];
+                sumdst[1] += s[1];
+                sumdst[2] += s[2];
+                sumdst[3] += s[3];
+            }
+            else//if (channels == 2)
+            {
+                // Lanes alternate between the two channels: fold the high
+                // half onto the low half.
+                float32x2_t vs2 = vadd_f32(vget_low_f32(vs), vget_high_f32(vs));
+                f32 s[2];
+                vst1_f32(s, vs2);
+
+                sumdst[0] += s[0];
+                sumdst[1] += s[1];
+
+                // At most one 2-channel pixel can be left after the vector loop.
+                if(i < width)
+                {
+                    sumdst[0] += src[i];
+                    sumdst[1] += src[i+1];
+                }
+            }
+        }//channels != 3
+    }
+#else
+    (void)_size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)sumdst;
+    (void)channels;
+#endif
+}
+
+// Reports whether sqsum() accepts the given channel count: the channel count
+// has to be a divisor of 4, i.e. 1, 2 or 4.
+bool isSqsumSupported(u32 channels)
+{
+    return channels == 1 || channels == 2 || channels == 4;
+}
+
+// Adds the per-channel sums and sums of squares of a u8 image to
+// sumdst[0 .. channels-1] and sqsumdst[0 .. channels-1].
+//
+// The results are accumulated on top of whatever the output arrays already
+// hold. channels must satisfy isSqsumSupported() (1, 2 or 4); this is passed
+// to internal::assertSupportedConfiguration. When CAROTENE_NEON is not defined
+// the body only marks its arguments as used.
+//
+// Each row is consumed in blocks: inside a block the values are accumulated in
+// u32 vector lanes, and at the end of the block the lanes are added to the
+// f64 outputs.
+void sqsum(const Size2D &_size,
+           const u8 * srcBase, ptrdiff_t srcStride,
+           f64 * sumdst, f64 * sqsumdst, u32 channels)
+{
+    internal::assertSupportedConfiguration(isSqsumSupported(channels));
+#ifdef CAROTENE_NEON
+    Size2D size(_size);
+    // Densely packed image: treat it as one long row.
+    if (srcStride == (ptrdiff_t)(size.width*channels))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+    const size_t width = size.width * channels;
+
+    // Block length in elements. Every inner iteration consumes 8 elements and
+    // adds at most 2 * 255 * 255 = 130050 to each u32 lane of v_sqsum, so a
+    // block may contain at most 33025 iterations before a lane can overflow.
+    // 1 << 18 elements are 32768 iterations (32768 * 130050 = 4261478400,
+    // which is below 2^32); a larger block would silently wrap for bright
+    // images.
+    size_t blockSize0 = 1 << 18;
+    size_t roiw8 = width & ~7;
+
+    uint32x4_t v_zero = vdupq_n_u32(0u);
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
+        size_t j = 0u;
+
+        while (j < roiw8)
+        {
+            // End index (exclusive) of the current block.
+            size_t blockSize = std::min(roiw8 - j, blockSize0) + j;
+            uint32x4_t v_sum = v_zero;
+            uint32x4_t v_sqsum = v_zero;
+
+            for ( ; j < blockSize ; j += 8, src += 8)
+            {
+                internal::prefetch(src);
+                uint8x8_t v_src0 = vld1_u8(src);
+
+                // Lane n receives elements n and n + 4 of the group; since
+                // channels divides 4, both belong to channel n % channels.
+                uint16x8_t v_src = vmovl_u8(v_src0);
+                uint16x4_t v_srclo = vget_low_u16(v_src), v_srchi = vget_high_u16(v_src);
+                v_sum = vaddq_u32(v_sum, vaddl_u16(v_srclo, v_srchi));
+                v_sqsum = vmlal_u16(v_sqsum, v_srclo, v_srclo);
+                v_sqsum = vmlal_u16(v_sqsum, v_srchi, v_srchi);
+            }
+
+            // arsum[0..3] holds the sums, arsum[4..7] the sums of squares.
+            u32 arsum[8];
+            vst1q_u32(arsum, v_sum);
+            vst1q_u32(arsum + 4, v_sqsum);
+
+            sumdst[0] += (f64)arsum[0];
+            sumdst[1 % channels] += (f64)arsum[1];
+            sumdst[2 % channels] += (f64)arsum[2];
+            sumdst[3 % channels] += (f64)arsum[3];
+            sqsumdst[0] += (f64)arsum[4];
+            sqsumdst[1 % channels] += (f64)arsum[5];
+            sqsumdst[2 % channels] += (f64)arsum[6];
+            sqsumdst[3 % channels] += (f64)arsum[7];
+        }
+        // collect a few last elements in the current row
+        // it's ok to process channels elements per step
+        // since we could handle 1,2 or 4 channels
+        // we always have channels-fold amount of elements remaining
+        for ( ; j < width; j+=channels, src+=channels)
+        {
+            for (u32 kk = 0; kk < channels; kk++)
+            {
+                u32 srcval = src[kk];
+                sumdst[kk] += srcval;
+                sqsumdst[kk] += srcval * srcval;
+            }
+        }
+    }
+#else
+    (void)_size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)sumdst;
+    (void)sqsumdst;
+    (void)channels;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/template_matching.cpp
+++ b/3rdparty/carotene/src/template_matching.cpp
@ -0,0 +1,241 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2013-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include <vector>
+#include <cstring>
+
+namespace CAROTENE_NS {
+
+// Compile-time switch for the branch of matchTemplate() that evaluates four
+// horizontally adjacent template positions per iteration.
+#define ENABLE4LINESMATCHING false  // Disabled since the overall time for simultaneous 4 lines matching is greater than
+                                    // the time for simultaneous 2 lines matching for the same amount of data.
+
+// Reports whether matchTemplate() accepts a template of the given size.
+//
+// Returns true only when isSupportedConfiguration() holds, the template is at
+// least 8 pixels wide and its area (width * height) does not exceed 256 pixels.
+bool isMatchTemplateSupported(const Size2D &tmplSize)
+{
+    if (!isSupportedConfiguration())
+        return false;
+
+    // Actually the function could process even shorter templates
+    // but there will be no NEON optimization in this case
+    if (tmplSize.width < 8)
+        return false;
+
+    // Same bound as width * height <= 256, written as a division so that the
+    // check cannot be defeated by the product wrapping around for huge sizes.
+    // width is known to be >= 8 here, so the division is well defined.
+    return tmplSize.height <= 256 / tmplSize.width;
+}
+
+// Computes the cross-correlation of an 8-bit template with an 8-bit image.
+//
+// For every placement (r, c) of the template inside the image, the sum of
+// tmpl[i][j] * src[r + i][c + j] over the whole template is stored as f32 in
+// row r, column c of dst. dst therefore receives
+// (srcSize.width - tmplSize.width + 1) columns and
+// (srcSize.height - tmplSize.height + 1) rows.
+//
+// srcStride, tmplStride and dstStride are row pitches in bytes.
+// When normalize is true every result is additionally divided by
+// sqrt(normL2(template)) * sqrt(s2), where s2 is the value for the window under
+// the template taken from the table filled by sqrIntegral().
+//
+// The template size has to satisfy isMatchTemplateSupported(). Nothing is
+// written when the template does not fit inside the image.
+void matchTemplate(const Size2D &srcSize,
+                   const u8 * srcBase, ptrdiff_t srcStride,
+                   const Size2D &tmplSize,
+                   const u8 * tmplBase, ptrdiff_t tmplStride,
+                   f32 * dstBase, ptrdiff_t dstStride,
+                   bool normalize)
+{
+    internal::assertSupportedConfiguration(isMatchTemplateSupported(tmplSize));
+#ifdef CAROTENE_NEON
+    // A template larger than the image has no valid placement. Without this
+    // guard the unsigned subtractions below would wrap around to huge extents.
+    if (srcSize.width < tmplSize.width || srcSize.height < tmplSize.height)
+        return;
+
+    const size_t tmplW = tmplSize.width;
+    const size_t tmplH = tmplSize.height;
+    const size_t dstW  = srcSize.width  - tmplSize.width  + 1;
+    const size_t dstH  = srcSize.height - tmplSize.height + 1;
+
+    //template correlation part
+    {
+        // The masks are built in size_t: an unsigned int mask such as ~1u would be
+        // zero-extended and clear the upper half of a 64-bit size.
+#if ENABLE4LINESMATCHING
+        const size_t dstroiw4 = dstW & ~(size_t)3;
+#endif
+        const size_t dstroiw2 = dstW & ~(size_t)1;
+        const size_t tmplroiw = tmplW & ~(size_t)7;
+
+        const u8  *imgrrow = srcBase;
+        for(size_t r = 0; r < dstH; ++r, imgrrow+=srcStride)
+        {
+            // Same row addressing as the normalization pass below, so both
+            // passes always touch the same destination elements.
+            f32 *corr = internal::getRowPtr(dstBase, dstStride, r);
+
+            size_t c = 0;
+#if ENABLE4LINESMATCHING
+            // Four adjacent template positions per iteration.
+            for(; c < dstroiw4; c+=4)
+            {
+                u32 dot[4] = {0, 0, 0, 0};
+                uint32x4_t vdot0 = vmovq_n_u32(0);
+                uint32x4_t vdot1 = vmovq_n_u32(0);
+                uint32x4_t vdot2 = vmovq_n_u32(0);
+                uint32x4_t vdot3 = vmovq_n_u32(0);
+
+                const u8  *img = imgrrow;
+                const u8 *tmpl = tmplBase;
+                for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
+                {
+                    size_t j = 0;
+                    for(; j < tmplroiw; j+=8)
+                    {
+                        uint8x8_t vtmpl = vld1_u8(tmpl + j);
+
+                        uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
+                        uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
+                        uint8x8_t vimg2 = vld1_u8(img + j + c + 2);
+                        uint8x8_t vimg3 = vld1_u8(img + j + c + 3);
+
+                        uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
+                        uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
+                        uint16x8_t vd2 = vmull_u8(vtmpl, vimg2);
+                        uint16x8_t vd3 = vmull_u8(vtmpl, vimg3);
+
+                        vdot0 = vpadalq_u16(vdot0, vd0);
+                        vdot1 = vpadalq_u16(vdot1, vd1);
+                        vdot2 = vpadalq_u16(vdot2, vd2);
+                        vdot3 = vpadalq_u16(vdot3, vd3);
+                    }
+                    // Scalar tail for template columns beyond the last full group of 8.
+                    for(; j < tmplW; ++j)
+                    {
+                        dot[0] += tmpl[j] * img[j + c + 0];
+                        dot[1] += tmpl[j] * img[j + c + 1];
+                        dot[2] += tmpl[j] * img[j + c + 2];
+                        dot[3] += tmpl[j] * img[j + c + 3];
+                    }
+                }
+                // Reduce each vector accumulator to one lane and add the scalar tails.
+                uint32x4_t vdotx   = vld1q_u32(dot);
+                uint32x2_t vdot_0  = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
+                uint32x2_t vdot_1  = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
+                uint32x2_t vdot_2  = vpadd_u32(vget_low_u32(vdot2), vget_high_u32(vdot2));
+                uint32x2_t vdot_3  = vpadd_u32(vget_low_u32(vdot3), vget_high_u32(vdot3));
+                uint32x2_t vdot_01 = vpadd_u32(vdot_0, vdot_1);
+                uint32x2_t vdot_23 = vpadd_u32(vdot_2, vdot_3);
+
+                vst1q_f32(corr + c, vcvtq_f32_u32(vaddq_u32(vdotx, vcombine_u32(vdot_01, vdot_23))));
+            }
+#endif
+
+            // Two adjacent template positions per iteration.
+            for(; c < dstroiw2; c+=2)
+            {
+                u32 dot[2] = {0, 0};
+                uint32x4_t vdot0 = vmovq_n_u32(0);
+                uint32x4_t vdot1 = vmovq_n_u32(0);
+                const u8  *img = imgrrow;
+                const u8 *tmpl = tmplBase;
+                for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
+                {
+                    size_t j = 0;
+                    for(; j < tmplroiw; j+=8)
+                    {
+                        uint8x8_t vtmpl = vld1_u8(tmpl + j);
+
+                        uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
+                        uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
+
+                        uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
+                        uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
+
+                        vdot0 = vpadalq_u16(vdot0, vd0);
+                        vdot1 = vpadalq_u16(vdot1, vd1);
+                    }
+                    // Scalar tail for template columns beyond the last full group of 8.
+                    for(; j < tmplW; ++j)
+                    {
+                        dot[0] += tmpl[j] * img[j + c + 0];
+                        dot[1] += tmpl[j] * img[j + c + 1];
+                    }
+                }
+                uint32x2_t vdotx  = vld1_u32(dot);
+                uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
+                uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
+                uint32x2_t vdot_  = vpadd_u32(vdot_0, vdot_1);
+                vst1_f32(corr + c, vcvt_f32_u32(vadd_u32(vdotx, vdot_)));
+            }
+
+            // Remaining single position when dstW is odd.
+            for(; c < dstW; ++c)
+            {
+                u32 dot = 0;
+                uint32x4_t vdot = vmovq_n_u32(0);
+                const u8  *img = imgrrow;
+                const u8 *tmpl = tmplBase;
+                for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
+                {
+                    size_t j = 0;
+                    for(; j < tmplroiw; j+=8)
+                    {
+                        uint8x8_t vtmpl = vld1_u8(tmpl + j);
+                        uint8x8_t vimg  = vld1_u8(img + j + c);
+                        uint16x8_t vd   = vmull_u8(vtmpl, vimg);
+                        vdot = vpadalq_u16(vdot, vd);
+                    }
+                    for(; j < tmplW; ++j)
+                        dot += tmpl[j] * img[j + c];
+                }
+                u32 wdot[2];
+                vst1_u32(wdot, vpadd_u32(vget_low_u32(vdot), vget_high_u32(vdot)));
+                dot += wdot[0] + wdot[1];
+                corr[c] = (f32)dot;
+            }
+        }
+    }
+
+    if(normalize)
+    {
+        f32 tn = std::sqrt((f32)normL2(tmplSize, tmplBase, tmplStride));
+
+        // Table of (srcSize.width + 1) x (srcSize.height + 1) doubles. Its first
+        // row and first column are zeroed here; sqrIntegral() fills the rest.
+        size_t iw = srcSize.width+1;
+        size_t ih = srcSize.height+1;
+        std::vector<f64> _sqsum(iw*ih);
+        f64 *sqsum = &_sqsum[0];
+        memset(sqsum, 0, iw*sizeof(f64));
+        for(size_t i = 1; i < ih; ++i)
+            sqsum[iw*i] = 0.;
+        sqrIntegral(srcSize, srcBase, srcStride, sqsum + iw + 1, iw*sizeof(f64));
+
+        for(size_t i = 0; i < dstH; ++i)
+        {
+            f32 *result = internal::getRowPtr(dstBase, dstStride, i);
+            for(size_t j = 0; j < dstW; ++j)
+            {
+                // Four-corner lookup: table value for the window covered by the
+                // template placed at (i, j).
+                double s2 = sqsum[iw*i + j] +
+                            sqsum[iw*(i + tmplSize.height) + j + tmplSize.width] -
+                            sqsum[iw*(i + tmplSize.height) + j] -
+                            sqsum[iw*i + j + tmplSize.width];
+
+                // NOTE(review): tn or s2 can be zero (e.g. an all-zero template or
+                // window), which makes this division produce NaN/Inf - confirm
+                // whether callers expect that.
+                result[j] /= tn * std::sqrt(s2);
+            }
+        }
+    }
+#else
+    (void)srcSize;
+    (void)srcBase;
+    (void)srcStride;
+    (void)tmplBase;
+    (void)tmplStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)normalize;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/threshold.cpp
+++ b/3rdparty/carotene/src/threshold.cpp
--- a/3rdparty/carotene/src/vtransform.hpp
+++ b/3rdparty/carotene/src/vtransform.hpp
@ -0,0 +1,689 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#ifndef CAROTENE_SRC_VTRANSFORM_HPP
+#define CAROTENE_SRC_VTRANSFORM_HPP
+
+#include "common.hpp"
+
+#include <carotene/types.hpp>
+
+#ifdef CAROTENE_NEON
+
+namespace CAROTENE_NS { namespace internal {
+
+////////////////////////////// Type Traits ///////////////////////
+
+// Maps an element type T and a channel count cn (1..4) to NEON vector types:
+//   vec128 - the 128-bit register type (cn = 1) or the structure of cn such
+//            registers (cn > 1);
+//   vec64  - the 64-bit counterpart of vec128;
+//   unsign - the VecTraits of the unsigned integer element type of the same
+//            width and the same channel count (f32 maps to u32).
+// Only the specializations listed below exist; the primary template is left
+// undefined.
+template <typename T, int cn = 1>
+struct VecTraits;
+
+template <> struct VecTraits< u8, 1> { typedef  uint8x16_t vec128; typedef   uint8x8_t vec64; typedef VecTraits<  u8, 1> unsign; };
+template <> struct VecTraits< s8, 1> { typedef   int8x16_t vec128; typedef    int8x8_t vec64; typedef VecTraits<  u8, 1> unsign; };
+template <> struct VecTraits<u16, 1> { typedef  uint16x8_t vec128; typedef  uint16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
+template <> struct VecTraits<s16, 1> { typedef   int16x8_t vec128; typedef   int16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
+template <> struct VecTraits<s32, 1> { typedef   int32x4_t vec128; typedef   int32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
+template <> struct VecTraits<u32, 1> { typedef  uint32x4_t vec128; typedef  uint32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
+template <> struct VecTraits<s64, 1> { typedef   int64x2_t vec128; typedef   int64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
+template <> struct VecTraits<u64, 1> { typedef  uint64x2_t vec128; typedef  uint64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
+template <> struct VecTraits<f32, 1> { typedef float32x4_t vec128; typedef float32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
+
+template <> struct VecTraits< u8, 2> { typedef  uint8x16x2_t vec128; typedef   uint8x8x2_t vec64; typedef VecTraits<  u8, 2> unsign; };
+template <> struct VecTraits< s8, 2> { typedef   int8x16x2_t vec128; typedef    int8x8x2_t vec64; typedef VecTraits<  u8, 2> unsign; };
+template <> struct VecTraits<u16, 2> { typedef  uint16x8x2_t vec128; typedef  uint16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
+template <> struct VecTraits<s16, 2> { typedef   int16x8x2_t vec128; typedef   int16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
+template <> struct VecTraits<s32, 2> { typedef   int32x4x2_t vec128; typedef   int32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
+template <> struct VecTraits<u32, 2> { typedef  uint32x4x2_t vec128; typedef  uint32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
+template <> struct VecTraits<s64, 2> { typedef   int64x2x2_t vec128; typedef   int64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
+template <> struct VecTraits<u64, 2> { typedef  uint64x2x2_t vec128; typedef  uint64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
+template <> struct VecTraits<f32, 2> { typedef float32x4x2_t vec128; typedef float32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
+
+template <> struct VecTraits< u8, 3> { typedef  uint8x16x3_t vec128; typedef   uint8x8x3_t vec64; typedef VecTraits<  u8, 3> unsign; };
+template <> struct VecTraits< s8, 3> { typedef   int8x16x3_t vec128; typedef    int8x8x3_t vec64; typedef VecTraits<  u8, 3> unsign; };
+template <> struct VecTraits<u16, 3> { typedef  uint16x8x3_t vec128; typedef  uint16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
+template <> struct VecTraits<s16, 3> { typedef   int16x8x3_t vec128; typedef   int16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
+template <> struct VecTraits<s32, 3> { typedef   int32x4x3_t vec128; typedef   int32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
+template <> struct VecTraits<u32, 3> { typedef  uint32x4x3_t vec128; typedef  uint32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
+template <> struct VecTraits<s64, 3> { typedef   int64x2x3_t vec128; typedef   int64x1x3_t vec64; typedef VecTraits< u64, 3> unsign; };
+template <> struct VecTraits<u64, 3> { typedef  uint64x2x3_t vec128; typedef  uint64x1x3_t vec64; typedef VecTraits< u64, 3> unsign; };
+template <> struct VecTraits<f32, 3> { typedef float32x4x3_t vec128; typedef float32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
+
+template <> struct VecTraits< u8, 4> { typedef  uint8x16x4_t vec128; typedef   uint8x8x4_t vec64; typedef VecTraits<  u8, 4> unsign; };
+template <> struct VecTraits< s8, 4> { typedef   int8x16x4_t vec128; typedef    int8x8x4_t vec64; typedef VecTraits<  u8, 4> unsign; };
+template <> struct VecTraits<u16, 4> { typedef  uint16x8x4_t vec128; typedef  uint16x4x4_t vec64; typedef VecTraits< u16, 4> unsign; };
+template <> struct VecTraits<s16, 4> { typedef   int16x8x4_t vec128; typedef   int16x4x4_t vec64; typedef VecTraits< u16, 4> unsign; };
+template <> struct VecTraits<s32, 4> { typedef   int32x4x4_t vec128; typedef   int32x2x4_t vec64; typedef VecTraits< u32, 4> unsign; };
+template <> struct VecTraits<u32, 4> { typedef  uint32x4x4_t vec128; typedef  uint32x2x4_t vec64; typedef VecTraits< u32, 4> unsign; };
+template <> struct VecTraits<s64, 4> { typedef   int64x2x4_t vec128; typedef   int64x1x4_t vec64; typedef VecTraits< u64, 4> unsign; };
+template <> struct VecTraits<u64, 4> { typedef  uint64x2x4_t vec128; typedef  uint64x1x4_t vec64; typedef VecTraits< u64, 4> unsign; };
+template <> struct VecTraits<f32, 4> { typedef float32x4x4_t vec128; typedef float32x2x4_t vec64; typedef VecTraits< u32, 4> unsign; };
+
+////////////////////////////// vld1q ///////////////////////
+
+// Type-overloaded load wrappers. Each overload forwards to the NEON intrinsic
+// whose suffix matches the pointer's element type, so a caller can write
+// vld1q(ptr) without spelling the suffix (presumably for use from templated
+// code). The "q" variants return 128-bit registers, the others 64-bit ones;
+// vld2/vld3/vld4 return structures of 2, 3 or 4 registers.
+// Overloads exist for u8, s8, u16, s16, u32, s32 and f32 only.
+inline  uint8x16_t vld1q(const u8  * ptr) { return  vld1q_u8(ptr); }
+inline   int8x16_t vld1q(const s8  * ptr) { return  vld1q_s8(ptr); }
+inline  uint16x8_t vld1q(const u16 * ptr) { return vld1q_u16(ptr); }
+inline   int16x8_t vld1q(const s16 * ptr) { return vld1q_s16(ptr); }
+inline  uint32x4_t vld1q(const u32 * ptr) { return vld1q_u32(ptr); }
+inline   int32x4_t vld1q(const s32 * ptr) { return vld1q_s32(ptr); }
+inline float32x4_t vld1q(const f32 * ptr) { return vld1q_f32(ptr); }
+
+////////////////////////////// vld1 ///////////////////////
+
+inline   uint8x8_t vld1(const u8  * ptr) { return  vld1_u8(ptr); }
+inline    int8x8_t vld1(const s8  * ptr) { return  vld1_s8(ptr); }
+inline  uint16x4_t vld1(const u16 * ptr) { return vld1_u16(ptr); }
+inline   int16x4_t vld1(const s16 * ptr) { return vld1_s16(ptr); }
+inline  uint32x2_t vld1(const u32 * ptr) { return vld1_u32(ptr); }
+inline   int32x2_t vld1(const s32 * ptr) { return vld1_s32(ptr); }
+inline float32x2_t vld1(const f32 * ptr) { return vld1_f32(ptr); }
+
+////////////////////////////// vld2q ///////////////////////
+
+inline  uint8x16x2_t vld2q(const u8  * ptr) { return  vld2q_u8(ptr); }
+inline   int8x16x2_t vld2q(const s8  * ptr) { return  vld2q_s8(ptr); }
+inline  uint16x8x2_t vld2q(const u16 * ptr) { return vld2q_u16(ptr); }
+inline   int16x8x2_t vld2q(const s16 * ptr) { return vld2q_s16(ptr); }
+inline  uint32x4x2_t vld2q(const u32 * ptr) { return vld2q_u32(ptr); }
+inline   int32x4x2_t vld2q(const s32 * ptr) { return vld2q_s32(ptr); }
+inline float32x4x2_t vld2q(const f32 * ptr) { return vld2q_f32(ptr); }
+
+////////////////////////////// vld2 ///////////////////////
+
+inline   uint8x8x2_t vld2(const u8  * ptr) { return  vld2_u8(ptr); }
+inline    int8x8x2_t vld2(const s8  * ptr) { return  vld2_s8(ptr); }
+inline  uint16x4x2_t vld2(const u16 * ptr) { return vld2_u16(ptr); }
+inline   int16x4x2_t vld2(const s16 * ptr) { return vld2_s16(ptr); }
+inline  uint32x2x2_t vld2(const u32 * ptr) { return vld2_u32(ptr); }
+inline   int32x2x2_t vld2(const s32 * ptr) { return vld2_s32(ptr); }
+inline float32x2x2_t vld2(const f32 * ptr) { return vld2_f32(ptr); }
+
+////////////////////////////// vld3q ///////////////////////
+
+inline  uint8x16x3_t vld3q(const u8  * ptr) { return  vld3q_u8(ptr); }
+inline   int8x16x3_t vld3q(const s8  * ptr) { return  vld3q_s8(ptr); }
+inline  uint16x8x3_t vld3q(const u16 * ptr) { return vld3q_u16(ptr); }
+inline   int16x8x3_t vld3q(const s16 * ptr) { return vld3q_s16(ptr); }
+inline  uint32x4x3_t vld3q(const u32 * ptr) { return vld3q_u32(ptr); }
+inline   int32x4x3_t vld3q(const s32 * ptr) { return vld3q_s32(ptr); }
+inline float32x4x3_t vld3q(const f32 * ptr) { return vld3q_f32(ptr); }
+
+////////////////////////////// vld3 ///////////////////////
+
+inline   uint8x8x3_t vld3(const u8  * ptr) { return  vld3_u8(ptr); }
+inline    int8x8x3_t vld3(const s8  * ptr) { return  vld3_s8(ptr); }
+inline  uint16x4x3_t vld3(const u16 * ptr) { return vld3_u16(ptr); }
+inline   int16x4x3_t vld3(const s16 * ptr) { return vld3_s16(ptr); }
+inline  uint32x2x3_t vld3(const u32 * ptr) { return vld3_u32(ptr); }
+inline   int32x2x3_t vld3(const s32 * ptr) { return vld3_s32(ptr); }
+inline float32x2x3_t vld3(const f32 * ptr) { return vld3_f32(ptr); }
+
+////////////////////////////// vld4q ///////////////////////
+
+inline  uint8x16x4_t vld4q(const u8  * ptr) { return  vld4q_u8(ptr); }
+inline   int8x16x4_t vld4q(const s8  * ptr) { return  vld4q_s8(ptr); }
+inline  uint16x8x4_t vld4q(const u16 * ptr) { return vld4q_u16(ptr); }
+inline   int16x8x4_t vld4q(const s16 * ptr) { return vld4q_s16(ptr); }
+inline  uint32x4x4_t vld4q(const u32 * ptr) { return vld4q_u32(ptr); }
+inline   int32x4x4_t vld4q(const s32 * ptr) { return vld4q_s32(ptr); }
+inline float32x4x4_t vld4q(const f32 * ptr) { return vld4q_f32(ptr); }
+
+////////////////////////////// vld4 ///////////////////////
+
+inline   uint8x8x4_t vld4(const u8  * ptr) { return  vld4_u8(ptr); }
+inline    int8x8x4_t vld4(const s8  * ptr) { return  vld4_s8(ptr); }
+inline  uint16x4x4_t vld4(const u16 * ptr) { return vld4_u16(ptr); }
+inline   int16x4x4_t vld4(const s16 * ptr) { return vld4_s16(ptr); }
+inline  uint32x2x4_t vld4(const u32 * ptr) { return vld4_u32(ptr); }
+inline   int32x2x4_t vld4(const s32 * ptr) { return vld4_s32(ptr); }
+inline float32x2x4_t vld4(const f32 * ptr) { return vld4_f32(ptr); }
+
+////////////////////////////// vst1q ///////////////////////
+
+// Type-overloaded store wrappers, the counterparts of the vld* overloads: each
+// forwards ptr and v to the NEON store intrinsic matching the element type.
+// The "return" in front of the intrinsic call returns a void expression, which
+// is valid C++ and has no effect beyond making the call.
+// Overloads exist for u8, s8, u16, s16, u32, s32 and f32 only.
+inline void vst1q(u8  * ptr, const uint8x16_t  & v) { return vst1q_u8(ptr,  v); }
+inline void vst1q(s8  * ptr, const int8x16_t   & v) { return vst1q_s8(ptr,  v); }
+inline void vst1q(u16 * ptr, const uint16x8_t  & v) { return vst1q_u16(ptr, v); }
+inline void vst1q(s16 * ptr, const int16x8_t   & v) { return vst1q_s16(ptr, v); }
+inline void vst1q(u32 * ptr, const uint32x4_t  & v) { return vst1q_u32(ptr, v); }
+inline void vst1q(s32 * ptr, const int32x4_t   & v) { return vst1q_s32(ptr, v); }
+inline void vst1q(f32 * ptr, const float32x4_t & v) { return vst1q_f32(ptr, v); }
+
+////////////////////////////// vst1 ///////////////////////
+
+inline void vst1(u8  * ptr, const uint8x8_t   & v) { return vst1_u8(ptr,  v); }
+inline void vst1(s8  * ptr, const int8x8_t    & v) { return vst1_s8(ptr,  v); }
+inline void vst1(u16 * ptr, const uint16x4_t  & v) { return vst1_u16(ptr, v); }
+inline void vst1(s16 * ptr, const int16x4_t   & v) { return vst1_s16(ptr, v); }
+inline void vst1(u32 * ptr, const uint32x2_t  & v) { return vst1_u32(ptr, v); }
+inline void vst1(s32 * ptr, const int32x2_t   & v) { return vst1_s32(ptr, v); }
+inline void vst1(f32 * ptr, const float32x2_t & v) { return vst1_f32(ptr, v); }
+
+////////////////////////////// vst2q ///////////////////////
+
+inline void vst2q(u8  * ptr, const uint8x16x2_t  & v) { return vst2q_u8(ptr,  v); }
+inline void vst2q(s8  * ptr, const int8x16x2_t   & v) { return vst2q_s8(ptr,  v); }
+inline void vst2q(u16 * ptr, const uint16x8x2_t  & v) { return vst2q_u16(ptr, v); }
+inline void vst2q(s16 * ptr, const int16x8x2_t   & v) { return vst2q_s16(ptr, v); }
+inline void vst2q(u32 * ptr, const uint32x4x2_t  & v) { return vst2q_u32(ptr, v); }
+inline void vst2q(s32 * ptr, const int32x4x2_t   & v) { return vst2q_s32(ptr, v); }
+inline void vst2q(f32 * ptr, const float32x4x2_t & v) { return vst2q_f32(ptr, v); }
+
+////////////////////////////// vst2 ///////////////////////
+
+inline void vst2(u8  * ptr, const uint8x8x2_t   & v) { return vst2_u8(ptr,  v); }
+inline void vst2(s8  * ptr, const int8x8x2_t    & v) { return vst2_s8(ptr,  v); }
+inline void vst2(u16 * ptr, const uint16x4x2_t  & v) { return vst2_u16(ptr, v); }
+inline void vst2(s16 * ptr, const int16x4x2_t   & v) { return vst2_s16(ptr, v); }
+inline void vst2(u32 * ptr, const uint32x2x2_t  & v) { return vst2_u32(ptr, v); }
+inline void vst2(s32 * ptr, const int32x2x2_t   & v) { return vst2_s32(ptr, v); }
+inline void vst2(f32 * ptr, const float32x2x2_t & v) { return vst2_f32(ptr, v); }
+
+////////////////////////////// vst3q ///////////////////////
+
+inline void vst3q(u8  * ptr, const uint8x16x3_t  & v) { return vst3q_u8(ptr,  v); }
+inline void vst3q(s8  * ptr, const int8x16x3_t   & v) { return vst3q_s8(ptr,  v); }
+inline void vst3q(u16 * ptr, const uint16x8x3_t  & v) { return vst3q_u16(ptr, v); }
+inline void vst3q(s16 * ptr, const int16x8x3_t   & v) { return vst3q_s16(ptr, v); }
+inline void vst3q(u32 * ptr, const uint32x4x3_t  & v) { return vst3q_u32(ptr, v); }
+inline void vst3q(s32 * ptr, const int32x4x3_t   & v) { return vst3q_s32(ptr, v); }
+inline void vst3q(f32 * ptr, const float32x4x3_t & v) { return vst3q_f32(ptr, v); }
+
+////////////////////////////// vst3 ///////////////////////
+
+inline void vst3(u8  * ptr, const uint8x8x3_t   & v) { return vst3_u8(ptr,  v); }
+inline void vst3(s8  * ptr, const int8x8x3_t    & v) { return vst3_s8(ptr,  v); }
+inline void vst3(u16 * ptr, const uint16x4x3_t  & v) { return vst3_u16(ptr, v); }
+inline void vst3(s16 * ptr, const int16x4x3_t   & v) { return vst3_s16(ptr, v); }
+inline void vst3(u32 * ptr, const uint32x2x3_t  & v) { return vst3_u32(ptr, v); }
+inline void vst3(s32 * ptr, const int32x2x3_t   & v) { return vst3_s32(ptr, v); }
+inline void vst3(f32 * ptr, const float32x2x3_t & v) { return vst3_f32(ptr, v); }
+
+////////////////////////////// vst4q ///////////////////////
+
+inline void vst4q(u8  * ptr, const uint8x16x4_t  & v) { return vst4q_u8(ptr,  v); }
+inline void vst4q(s8  * ptr, const int8x16x4_t   & v) { return vst4q_s8(ptr,  v); }
+inline void vst4q(u16 * ptr, const uint16x8x4_t  & v) { return vst4q_u16(ptr, v); }
+inline void vst4q(s16 * ptr, const int16x8x4_t   & v) { return vst4q_s16(ptr, v); }
+inline void vst4q(u32 * ptr, const uint32x4x4_t  & v) { return vst4q_u32(ptr, v); }
+inline void vst4q(s32 * ptr, const int32x4x4_t   & v) { return vst4q_s32(ptr, v); }
+inline void vst4q(f32 * ptr, const float32x4x4_t & v) { return vst4q_f32(ptr, v); }
+
+////////////////////////////// vst4 ///////////////////////
+
+inline void vst4(u8  * ptr, const uint8x8x4_t   & v) { return vst4_u8(ptr,  v); }
+inline void vst4(s8  * ptr, const int8x8x4_t    & v) { return vst4_s8(ptr,  v); }
+inline void vst4(u16 * ptr, const uint16x4x4_t  & v) { return vst4_u16(ptr, v); }
+inline void vst4(s16 * ptr, const int16x4x4_t   & v) { return vst4_s16(ptr, v); }
+inline void vst4(u32 * ptr, const uint32x2x4_t  & v) { return vst4_u32(ptr, v); }
+inline void vst4(s32 * ptr, const int32x2x4_t   & v) { return vst4_s32(ptr, v); }
+inline void vst4(f32 * ptr, const float32x2x4_t & v) { return vst4_f32(ptr, v); }
+
+////////////////////////////// vabdq ///////////////////////
+
+// Type-overloaded wrappers around the NEON vabd* (absolute difference)
+// intrinsics: the overload is selected by the vector type of the arguments and
+// forwards v0 and v1 unchanged. vabdq takes 128-bit vectors, vabd 64-bit ones.
+inline  uint8x16_t vabdq(const uint8x16_t  & v0, const uint8x16_t  & v1) { return vabdq_u8 (v0, v1); }
+inline   int8x16_t vabdq(const int8x16_t   & v0, const int8x16_t   & v1) { return vabdq_s8 (v0, v1); }
+inline  uint16x8_t vabdq(const uint16x8_t  & v0, const uint16x8_t  & v1) { return vabdq_u16(v0, v1); }
+inline   int16x8_t vabdq(const int16x8_t   & v0, const int16x8_t   & v1) { return vabdq_s16(v0, v1); }
+inline  uint32x4_t vabdq(const uint32x4_t  & v0, const uint32x4_t  & v1) { return vabdq_u32(v0, v1); }
+inline   int32x4_t vabdq(const int32x4_t   & v0, const int32x4_t   & v1) { return vabdq_s32(v0, v1); }
+inline float32x4_t vabdq(const float32x4_t & v0, const float32x4_t & v1) { return vabdq_f32(v0, v1); }
+
+////////////////////////////// vabd ///////////////////////
+
+inline   uint8x8_t vabd(const uint8x8_t   & v0, const uint8x8_t   & v1) { return vabd_u8 (v0, v1); }
+inline    int8x8_t vabd(const int8x8_t    & v0, const int8x8_t    & v1) { return vabd_s8 (v0, v1); }
+inline  uint16x4_t vabd(const uint16x4_t  & v0, const uint16x4_t  & v1) { return vabd_u16(v0, v1); }
+inline   int16x4_t vabd(const int16x4_t   & v0, const int16x4_t   & v1) { return vabd_s16(v0, v1); }
+inline  uint32x2_t vabd(const uint32x2_t  & v0, const uint32x2_t  & v1) { return vabd_u32(v0, v1); }
+inline   int32x2_t vabd(const int32x2_t   & v0, const int32x2_t   & v1) { return vabd_s32(v0, v1); }
+inline float32x2_t vabd(const float32x2_t & v0, const float32x2_t & v1) { return vabd_f32(v0, v1); }
+
+////////////////////////////// vminq ///////////////////////
+
+// Type-overloaded wrappers around the NEON vmin* (minimum) intrinsics: the
+// overload is selected by the vector type of the arguments and forwards v0 and
+// v1 unchanged. vminq takes 128-bit vectors, vmin 64-bit ones.
+inline  uint8x16_t vminq(const uint8x16_t  & v0, const uint8x16_t  & v1) { return vminq_u8 (v0, v1); }
+inline   int8x16_t vminq(const int8x16_t   & v0, const int8x16_t   & v1) { return vminq_s8 (v0, v1); }
+inline  uint16x8_t vminq(const uint16x8_t  & v0, const uint16x8_t  & v1) { return vminq_u16(v0, v1); }
+inline   int16x8_t vminq(const int16x8_t   & v0, const int16x8_t   & v1) { return vminq_s16(v0, v1); }
+inline  uint32x4_t vminq(const uint32x4_t  & v0, const uint32x4_t  & v1) { return vminq_u32(v0, v1); }
+inline   int32x4_t vminq(const int32x4_t   & v0, const int32x4_t   & v1) { return vminq_s32(v0, v1); }
+inline float32x4_t vminq(const float32x4_t & v0, const float32x4_t & v1) { return vminq_f32(v0, v1); }
+
+////////////////////////////// vmin ///////////////////////
+
+inline   uint8x8_t vmin(const uint8x8_t   & v0, const uint8x8_t   & v1) { return vmin_u8 (v0, v1); }
+inline    int8x8_t vmin(const int8x8_t    & v0, const int8x8_t    & v1) { return vmin_s8 (v0, v1); }
+inline  uint16x4_t vmin(const uint16x4_t  & v0, const uint16x4_t  & v1) { return vmin_u16(v0, v1); }
+inline   int16x4_t vmin(const int16x4_t   & v0, const int16x4_t   & v1) { return vmin_s16(v0, v1); }
+inline  uint32x2_t vmin(const uint32x2_t  & v0, const uint32x2_t  & v1) { return vmin_u32(v0, v1); }
+inline   int32x2_t vmin(const int32x2_t   & v0, const int32x2_t   & v1) { return vmin_s32(v0, v1); }
+inline float32x2_t vmin(const float32x2_t & v0, const float32x2_t & v1) { return vmin_f32(v0, v1); }
+
+////////////////////////////// vmaxq ///////////////////////
+
+// Type-overloaded wrappers around the NEON vmax* (maximum) intrinsics: the
+// overload is selected by the vector type of the arguments and forwards v0 and
+// v1 unchanged. vmaxq takes 128-bit vectors, vmax 64-bit ones.
+inline  uint8x16_t vmaxq(const uint8x16_t  & v0, const uint8x16_t  & v1) { return vmaxq_u8 (v0, v1); }
+inline   int8x16_t vmaxq(const int8x16_t   & v0, const int8x16_t   & v1) { return vmaxq_s8 (v0, v1); }
+inline  uint16x8_t vmaxq(const uint16x8_t  & v0, const uint16x8_t  & v1) { return vmaxq_u16(v0, v1); }
+inline   int16x8_t vmaxq(const int16x8_t   & v0, const int16x8_t   & v1) { return vmaxq_s16(v0, v1); }
+inline  uint32x4_t vmaxq(const uint32x4_t  & v0, const uint32x4_t  & v1) { return vmaxq_u32(v0, v1); }
+inline   int32x4_t vmaxq(const int32x4_t   & v0, const int32x4_t   & v1) { return vmaxq_s32(v0, v1); }
+inline float32x4_t vmaxq(const float32x4_t & v0, const float32x4_t & v1) { return vmaxq_f32(v0, v1); }
+
+////////////////////////////// vmax ///////////////////////
+
+inline   uint8x8_t vmax(const uint8x8_t   & v0, const uint8x8_t   & v1) { return vmax_u8 (v0, v1); }
+inline    int8x8_t vmax(const int8x8_t    & v0, const int8x8_t    & v1) { return vmax_s8 (v0, v1); }
+inline  uint16x4_t vmax(const uint16x4_t  & v0, const uint16x4_t  & v1) { return vmax_u16(v0, v1); }
+inline   int16x4_t vmax(const int16x4_t   & v0, const int16x4_t   & v1) { return vmax_s16(v0, v1); }
+inline  uint32x2_t vmax(const uint32x2_t  & v0, const uint32x2_t  & v1) { return vmax_u32(v0, v1); }
+inline   int32x2_t vmax(const int32x2_t   & v0, const int32x2_t   & v1) { return vmax_s32(v0, v1); }
+inline float32x2_t vmax(const float32x2_t & v0, const float32x2_t & v1) { return vmax_f32(v0, v1); }
+
+////////////////////////////// vdupq_n ///////////////////////
+
+// vdupq_n() overloads: broadcast one scalar into every lane of a 128-bit vector by forwarding
+// to the vdupq_n_<suffix> intrinsic that matches the scalar type.
+inline uint8x16_t vdupq_n(const u8 & scalar) { return vdupq_n_u8(scalar); }
+inline int8x16_t vdupq_n(const s8 & scalar) { return vdupq_n_s8(scalar); }
+inline uint16x8_t vdupq_n(const u16 & scalar) { return vdupq_n_u16(scalar); }
+inline int16x8_t vdupq_n(const s16 & scalar) { return vdupq_n_s16(scalar); }
+inline uint32x4_t vdupq_n(const u32 & scalar) { return vdupq_n_u32(scalar); }
+inline int32x4_t vdupq_n(const s32 & scalar) { return vdupq_n_s32(scalar); }
+inline uint64x2_t vdupq_n(const u64 & scalar) { return vdupq_n_u64(scalar); }
+inline int64x2_t vdupq_n(const s64 & scalar) { return vdupq_n_s64(scalar); }
+inline float32x4_t vdupq_n(const f32 & scalar) { return vdupq_n_f32(scalar); }
+
+////////////////////////////// vdup_n ///////////////////////
+
+// vdup_n() overloads: broadcast one scalar into every lane of a 64-bit vector by forwarding
+// to the vdup_n_<suffix> intrinsic that matches the scalar type.
+inline uint8x8_t vdup_n(const u8 & scalar) { return vdup_n_u8(scalar); }
+inline int8x8_t vdup_n(const s8 & scalar) { return vdup_n_s8(scalar); }
+inline uint16x4_t vdup_n(const u16 & scalar) { return vdup_n_u16(scalar); }
+inline int16x4_t vdup_n(const s16 & scalar) { return vdup_n_s16(scalar); }
+inline uint32x2_t vdup_n(const u32 & scalar) { return vdup_n_u32(scalar); }
+inline int32x2_t vdup_n(const s32 & scalar) { return vdup_n_s32(scalar); }
+inline uint64x1_t vdup_n(const u64 & scalar) { return vdup_n_u64(scalar); }
+inline int64x1_t vdup_n(const s64 & scalar) { return vdup_n_s64(scalar); }
+inline float32x2_t vdup_n(const f32 & scalar) { return vdup_n_f32(scalar); }
+
+////////////////////////////// vget_low ///////////////////////
+
+// vget_low() overloads: return the lower 64-bit half of a 128-bit vector by forwarding to the
+// vget_low_<suffix> intrinsic that matches the argument type.
+inline uint8x8_t vget_low(const uint8x16_t & src) { return vget_low_u8(src); }
+inline int8x8_t vget_low(const int8x16_t & src) { return vget_low_s8(src); }
+inline uint16x4_t vget_low(const uint16x8_t & src) { return vget_low_u16(src); }
+inline int16x4_t vget_low(const int16x8_t & src) { return vget_low_s16(src); }
+inline uint32x2_t vget_low(const uint32x4_t & src) { return vget_low_u32(src); }
+inline int32x2_t vget_low(const int32x4_t & src) { return vget_low_s32(src); }
+inline float32x2_t vget_low(const float32x4_t & src) { return vget_low_f32(src); }
+
+////////////////////////////// vget_high ///////////////////////
+
+// vget_high() overloads: return the upper 64-bit half of a 128-bit vector by forwarding to the
+// vget_high_<suffix> intrinsic that matches the argument type.
+inline uint8x8_t vget_high(const uint8x16_t & src) { return vget_high_u8(src); }
+inline int8x8_t vget_high(const int8x16_t & src) { return vget_high_s8(src); }
+inline uint16x4_t vget_high(const uint16x8_t & src) { return vget_high_u16(src); }
+inline int16x4_t vget_high(const int16x8_t & src) { return vget_high_s16(src); }
+inline uint32x2_t vget_high(const uint32x4_t & src) { return vget_high_u32(src); }
+inline int32x2_t vget_high(const int32x4_t & src) { return vget_high_s32(src); }
+inline float32x2_t vget_high(const float32x4_t & src) { return vget_high_f32(src); }
+
+////////////////////////////// vcombine ///////////////////////
+
+// vcombine() overloads: join two 64-bit vectors into one 128-bit vector (first argument becomes
+// the low half) by forwarding to the vcombine_<suffix> intrinsic that matches the argument type.
+inline uint8x16_t vcombine(const uint8x8_t & lo, const uint8x8_t & hi) { return vcombine_u8(lo, hi); }
+inline int8x16_t vcombine(const int8x8_t & lo, const int8x8_t & hi) { return vcombine_s8(lo, hi); }
+inline uint16x8_t vcombine(const uint16x4_t & lo, const uint16x4_t & hi) { return vcombine_u16(lo, hi); }
+inline int16x8_t vcombine(const int16x4_t & lo, const int16x4_t & hi) { return vcombine_s16(lo, hi); }
+inline uint32x4_t vcombine(const uint32x2_t & lo, const uint32x2_t & hi) { return vcombine_u32(lo, hi); }
+inline int32x4_t vcombine(const int32x2_t & lo, const int32x2_t & hi) { return vcombine_s32(lo, hi); }
+inline float32x4_t vcombine(const float32x2_t & lo, const float32x2_t & hi) { return vcombine_f32(lo, hi); }
+
+////////////////////////////// vaddq ///////////////////////
+
+// vaddq() overloads for 128-bit vectors: each one forwards to the vaddq_<suffix> intrinsic
+// (lane-wise addition) selected by the argument type.
+inline uint8x16_t vaddq(const uint8x16_t & a, const uint8x16_t & b) { return vaddq_u8(a, b); }
+inline int8x16_t vaddq(const int8x16_t & a, const int8x16_t & b) { return vaddq_s8(a, b); }
+inline uint16x8_t vaddq(const uint16x8_t & a, const uint16x8_t & b) { return vaddq_u16(a, b); }
+inline int16x8_t vaddq(const int16x8_t & a, const int16x8_t & b) { return vaddq_s16(a, b); }
+inline uint32x4_t vaddq(const uint32x4_t & a, const uint32x4_t & b) { return vaddq_u32(a, b); }
+inline int32x4_t vaddq(const int32x4_t & a, const int32x4_t & b) { return vaddq_s32(a, b); }
+inline float32x4_t vaddq(const float32x4_t & a, const float32x4_t & b) { return vaddq_f32(a, b); }
+
+////////////////////////////// vadd ///////////////////////
+
+// vadd() overloads for 64-bit vectors: each one forwards to the vadd_<suffix> intrinsic
+// (lane-wise addition) selected by the argument type.
+inline uint8x8_t vadd(const uint8x8_t & a, const uint8x8_t & b) { return vadd_u8(a, b); }
+inline int8x8_t vadd(const int8x8_t & a, const int8x8_t & b) { return vadd_s8(a, b); }
+inline uint16x4_t vadd(const uint16x4_t & a, const uint16x4_t & b) { return vadd_u16(a, b); }
+inline int16x4_t vadd(const int16x4_t & a, const int16x4_t & b) { return vadd_s16(a, b); }
+inline uint32x2_t vadd(const uint32x2_t & a, const uint32x2_t & b) { return vadd_u32(a, b); }
+inline int32x2_t vadd(const int32x2_t & a, const int32x2_t & b) { return vadd_s32(a, b); }
+inline float32x2_t vadd(const float32x2_t & a, const float32x2_t & b) { return vadd_f32(a, b); }
+
+////////////////////////////// vqaddq ///////////////////////
+
+// vqaddq() overloads for 128-bit integer vectors: each one forwards to the vqaddq_<suffix>
+// intrinsic (lane-wise saturating addition) selected by the argument type.
+inline uint8x16_t vqaddq(const uint8x16_t & a, const uint8x16_t & b) { return vqaddq_u8(a, b); }
+inline int8x16_t vqaddq(const int8x16_t & a, const int8x16_t & b) { return vqaddq_s8(a, b); }
+inline uint16x8_t vqaddq(const uint16x8_t & a, const uint16x8_t & b) { return vqaddq_u16(a, b); }
+inline int16x8_t vqaddq(const int16x8_t & a, const int16x8_t & b) { return vqaddq_s16(a, b); }
+inline uint32x4_t vqaddq(const uint32x4_t & a, const uint32x4_t & b) { return vqaddq_u32(a, b); }
+inline int32x4_t vqaddq(const int32x4_t & a, const int32x4_t & b) { return vqaddq_s32(a, b); }
+
+////////////////////////////// vqadd ///////////////////////
+
+// vqadd() overloads for 64-bit integer vectors: each one forwards to the vqadd_<suffix>
+// intrinsic (lane-wise saturating addition) selected by the argument type.
+inline uint8x8_t vqadd(const uint8x8_t & a, const uint8x8_t & b) { return vqadd_u8(a, b); }
+inline int8x8_t vqadd(const int8x8_t & a, const int8x8_t & b) { return vqadd_s8(a, b); }
+inline uint16x4_t vqadd(const uint16x4_t & a, const uint16x4_t & b) { return vqadd_u16(a, b); }
+inline int16x4_t vqadd(const int16x4_t & a, const int16x4_t & b) { return vqadd_s16(a, b); }
+inline uint32x2_t vqadd(const uint32x2_t & a, const uint32x2_t & b) { return vqadd_u32(a, b); }
+inline int32x2_t vqadd(const int32x2_t & a, const int32x2_t & b) { return vqadd_s32(a, b); }
+
+////////////////////////////// vsubq ///////////////////////
+
+// vsubq() overloads for 128-bit vectors: each one forwards to the vsubq_<suffix> intrinsic
+// (lane-wise subtraction, a - b) selected by the argument type.
+inline uint8x16_t vsubq(const uint8x16_t & a, const uint8x16_t & b) { return vsubq_u8(a, b); }
+inline int8x16_t vsubq(const int8x16_t & a, const int8x16_t & b) { return vsubq_s8(a, b); }
+inline uint16x8_t vsubq(const uint16x8_t & a, const uint16x8_t & b) { return vsubq_u16(a, b); }
+inline int16x8_t vsubq(const int16x8_t & a, const int16x8_t & b) { return vsubq_s16(a, b); }
+inline uint32x4_t vsubq(const uint32x4_t & a, const uint32x4_t & b) { return vsubq_u32(a, b); }
+inline int32x4_t vsubq(const int32x4_t & a, const int32x4_t & b) { return vsubq_s32(a, b); }
+inline float32x4_t vsubq(const float32x4_t & a, const float32x4_t & b) { return vsubq_f32(a, b); }
+
+////////////////////////////// vsub ///////////////////////
+
+// vsub() overloads for 64-bit vectors: each one forwards to the vsub_<suffix> intrinsic
+// (lane-wise subtraction, a - b) selected by the argument type.
+inline uint8x8_t vsub(const uint8x8_t & a, const uint8x8_t & b) { return vsub_u8(a, b); }
+inline int8x8_t vsub(const int8x8_t & a, const int8x8_t & b) { return vsub_s8(a, b); }
+inline uint16x4_t vsub(const uint16x4_t & a, const uint16x4_t & b) { return vsub_u16(a, b); }
+inline int16x4_t vsub(const int16x4_t & a, const int16x4_t & b) { return vsub_s16(a, b); }
+inline uint32x2_t vsub(const uint32x2_t & a, const uint32x2_t & b) { return vsub_u32(a, b); }
+inline int32x2_t vsub(const int32x2_t & a, const int32x2_t & b) { return vsub_s32(a, b); }
+inline float32x2_t vsub(const float32x2_t & a, const float32x2_t & b) { return vsub_f32(a, b); }
+
+////////////////////////////// vqsubq ///////////////////////
+
+// vqsubq() overloads for 128-bit integer vectors: each one forwards to the vqsubq_<suffix>
+// intrinsic (lane-wise saturating subtraction, a - b) selected by the argument type.
+inline uint8x16_t vqsubq(const uint8x16_t & a, const uint8x16_t & b) { return vqsubq_u8(a, b); }
+inline int8x16_t vqsubq(const int8x16_t & a, const int8x16_t & b) { return vqsubq_s8(a, b); }
+inline uint16x8_t vqsubq(const uint16x8_t & a, const uint16x8_t & b) { return vqsubq_u16(a, b); }
+inline int16x8_t vqsubq(const int16x8_t & a, const int16x8_t & b) { return vqsubq_s16(a, b); }
+inline uint32x4_t vqsubq(const uint32x4_t & a, const uint32x4_t & b) { return vqsubq_u32(a, b); }
+inline int32x4_t vqsubq(const int32x4_t & a, const int32x4_t & b) { return vqsubq_s32(a, b); }
+inline uint64x2_t vqsubq(const uint64x2_t & a, const uint64x2_t & b) { return vqsubq_u64(a, b); }
+inline int64x2_t vqsubq(const int64x2_t & a, const int64x2_t & b) { return vqsubq_s64(a, b); }
+
+////////////////////////////// vqsub ///////////////////////
+
+// vqsub() overloads for 64-bit integer vectors: each one forwards to the vqsub_<suffix>
+// intrinsic (lane-wise saturating subtraction, a - b) selected by the argument type.
+inline uint8x8_t vqsub(const uint8x8_t & a, const uint8x8_t & b) { return vqsub_u8(a, b); }
+inline int8x8_t vqsub(const int8x8_t & a, const int8x8_t & b) { return vqsub_s8(a, b); }
+inline uint16x4_t vqsub(const uint16x4_t & a, const uint16x4_t & b) { return vqsub_u16(a, b); }
+inline int16x4_t vqsub(const int16x4_t & a, const int16x4_t & b) { return vqsub_s16(a, b); }
+inline uint32x2_t vqsub(const uint32x2_t & a, const uint32x2_t & b) { return vqsub_u32(a, b); }
+inline int32x2_t vqsub(const int32x2_t & a, const int32x2_t & b) { return vqsub_s32(a, b); }
+inline uint64x1_t vqsub(const uint64x1_t & a, const uint64x1_t & b) { return vqsub_u64(a, b); }
+inline int64x1_t vqsub(const int64x1_t & a, const int64x1_t & b) { return vqsub_s64(a, b); }
+
+////////////////////////////// vmull ///////////////////////
+
+// vmull() overloads: widening lane-wise multiply of two 64-bit vectors. The result is a 128-bit
+// vector whose lanes are twice as wide as the inputs; each overload forwards to vmull_<suffix>.
+inline uint16x8_t vmull(const uint8x8_t & a, const uint8x8_t & b) { return vmull_u8(a, b); }
+inline int16x8_t vmull(const int8x8_t & a, const int8x8_t & b) { return vmull_s8(a, b); }
+inline uint32x4_t vmull(const uint16x4_t & a, const uint16x4_t & b) { return vmull_u16(a, b); }
+inline int32x4_t vmull(const int16x4_t & a, const int16x4_t & b) { return vmull_s16(a, b); }
+inline uint64x2_t vmull(const uint32x2_t & a, const uint32x2_t & b) { return vmull_u32(a, b); }
+inline int64x2_t vmull(const int32x2_t & a, const int32x2_t & b) { return vmull_s32(a, b); }
+
+////////////////////////////// vrev64q ///////////////////////
+
+// vrev64q() overloads for 128-bit vectors: each one forwards to the vrev64q_<suffix> intrinsic
+// (reverse the order of the lanes inside every 64-bit doubleword).
+inline uint8x16_t vrev64q(const uint8x16_t & src) { return vrev64q_u8(src); }
+inline int8x16_t vrev64q(const int8x16_t & src) { return vrev64q_s8(src); }
+inline uint16x8_t vrev64q(const uint16x8_t & src) { return vrev64q_u16(src); }
+inline int16x8_t vrev64q(const int16x8_t & src) { return vrev64q_s16(src); }
+inline uint32x4_t vrev64q(const uint32x4_t & src) { return vrev64q_u32(src); }
+inline int32x4_t vrev64q(const int32x4_t & src) { return vrev64q_s32(src); }
+inline float32x4_t vrev64q(const float32x4_t & src) { return vrev64q_f32(src); }
+
+////////////////////////////// vrev64 ///////////////////////
+
+// vrev64() overloads for 64-bit vectors: each one forwards to the vrev64_<suffix> intrinsic
+// (reverse the order of the lanes inside the 64-bit doubleword).
+inline uint8x8_t vrev64(const uint8x8_t & src) { return vrev64_u8(src); }
+inline int8x8_t vrev64(const int8x8_t & src) { return vrev64_s8(src); }
+inline uint16x4_t vrev64(const uint16x4_t & src) { return vrev64_u16(src); }
+inline int16x4_t vrev64(const int16x4_t & src) { return vrev64_s16(src); }
+inline uint32x2_t vrev64(const uint32x2_t & src) { return vrev64_u32(src); }
+inline int32x2_t vrev64(const int32x2_t & src) { return vrev64_s32(src); }
+inline float32x2_t vrev64(const float32x2_t & src) { return vrev64_f32(src); }
+
+////////////////////////////// vceqq ///////////////////////
+
+// vceqq() overloads for 128-bit vectors: lane-wise "a == b" via vceqq_<suffix>. The result is
+// always an unsigned mask vector with the same lane width as the operands.
+inline uint8x16_t vceqq(const uint8x16_t & a, const uint8x16_t & b) { return vceqq_u8(a, b); }
+inline uint8x16_t vceqq(const int8x16_t & a, const int8x16_t & b) { return vceqq_s8(a, b); }
+inline uint16x8_t vceqq(const uint16x8_t & a, const uint16x8_t & b) { return vceqq_u16(a, b); }
+inline uint16x8_t vceqq(const int16x8_t & a, const int16x8_t & b) { return vceqq_s16(a, b); }
+inline uint32x4_t vceqq(const uint32x4_t & a, const uint32x4_t & b) { return vceqq_u32(a, b); }
+inline uint32x4_t vceqq(const int32x4_t & a, const int32x4_t & b) { return vceqq_s32(a, b); }
+inline uint32x4_t vceqq(const float32x4_t & a, const float32x4_t & b) { return vceqq_f32(a, b); }
+
+////////////////////////////// vceq ///////////////////////
+
+// vceq() overloads for 64-bit vectors: lane-wise "a == b" via vceq_<suffix>. The result is
+// always an unsigned mask vector with the same lane width as the operands.
+inline uint8x8_t vceq(const uint8x8_t & a, const uint8x8_t & b) { return vceq_u8(a, b); }
+inline uint8x8_t vceq(const int8x8_t & a, const int8x8_t & b) { return vceq_s8(a, b); }
+inline uint16x4_t vceq(const uint16x4_t & a, const uint16x4_t & b) { return vceq_u16(a, b); }
+inline uint16x4_t vceq(const int16x4_t & a, const int16x4_t & b) { return vceq_s16(a, b); }
+inline uint32x2_t vceq(const uint32x2_t & a, const uint32x2_t & b) { return vceq_u32(a, b); }
+inline uint32x2_t vceq(const int32x2_t & a, const int32x2_t & b) { return vceq_s32(a, b); }
+inline uint32x2_t vceq(const float32x2_t & a, const float32x2_t & b) { return vceq_f32(a, b); }
+
+////////////////////////////// vcgtq ///////////////////////
+
+// vcgtq() overloads for 128-bit vectors: lane-wise "a > b" via vcgtq_<suffix>. The result is
+// always an unsigned mask vector with the same lane width as the operands.
+inline uint8x16_t vcgtq(const uint8x16_t & a, const uint8x16_t & b) { return vcgtq_u8(a, b); }
+inline uint8x16_t vcgtq(const int8x16_t & a, const int8x16_t & b) { return vcgtq_s8(a, b); }
+inline uint16x8_t vcgtq(const uint16x8_t & a, const uint16x8_t & b) { return vcgtq_u16(a, b); }
+inline uint16x8_t vcgtq(const int16x8_t & a, const int16x8_t & b) { return vcgtq_s16(a, b); }
+inline uint32x4_t vcgtq(const uint32x4_t & a, const uint32x4_t & b) { return vcgtq_u32(a, b); }
+inline uint32x4_t vcgtq(const int32x4_t & a, const int32x4_t & b) { return vcgtq_s32(a, b); }
+inline uint32x4_t vcgtq(const float32x4_t & a, const float32x4_t & b) { return vcgtq_f32(a, b); }
+
+////////////////////////////// vcgt ///////////////////////
+
+// vcgt() overloads for 64-bit vectors: lane-wise "a > b" via vcgt_<suffix>. The result is
+// always an unsigned mask vector with the same lane width as the operands.
+inline uint8x8_t vcgt(const uint8x8_t & a, const uint8x8_t & b) { return vcgt_u8(a, b); }
+inline uint8x8_t vcgt(const int8x8_t & a, const int8x8_t & b) { return vcgt_s8(a, b); }
+inline uint16x4_t vcgt(const uint16x4_t & a, const uint16x4_t & b) { return vcgt_u16(a, b); }
+inline uint16x4_t vcgt(const int16x4_t & a, const int16x4_t & b) { return vcgt_s16(a, b); }
+inline uint32x2_t vcgt(const uint32x2_t & a, const uint32x2_t & b) { return vcgt_u32(a, b); }
+inline uint32x2_t vcgt(const int32x2_t & a, const int32x2_t & b) { return vcgt_s32(a, b); }
+inline uint32x2_t vcgt(const float32x2_t & a, const float32x2_t & b) { return vcgt_f32(a, b); }
+
+////////////////////////////// vcgeq ///////////////////////
+
+// vcgeq() overloads for 128-bit vectors: lane-wise "a >= b" via vcgeq_<suffix>. The result is
+// always an unsigned mask vector with the same lane width as the operands.
+inline uint8x16_t vcgeq(const uint8x16_t & a, const uint8x16_t & b) { return vcgeq_u8(a, b); }
+inline uint8x16_t vcgeq(const int8x16_t & a, const int8x16_t & b) { return vcgeq_s8(a, b); }
+inline uint16x8_t vcgeq(const uint16x8_t & a, const uint16x8_t & b) { return vcgeq_u16(a, b); }
+inline uint16x8_t vcgeq(const int16x8_t & a, const int16x8_t & b) { return vcgeq_s16(a, b); }
+inline uint32x4_t vcgeq(const uint32x4_t & a, const uint32x4_t & b) { return vcgeq_u32(a, b); }
+inline uint32x4_t vcgeq(const int32x4_t & a, const int32x4_t & b) { return vcgeq_s32(a, b); }
+inline uint32x4_t vcgeq(const float32x4_t & a, const float32x4_t & b) { return vcgeq_f32(a, b); }
+
+////////////////////////////// vcge ///////////////////////
+
+// vcge() overloads for 64-bit vectors: lane-wise "a >= b" via vcge_<suffix>. The result is
+// always an unsigned mask vector with the same lane width as the operands.
+inline uint8x8_t vcge(const uint8x8_t & a, const uint8x8_t & b) { return vcge_u8(a, b); }
+inline uint8x8_t vcge(const int8x8_t & a, const int8x8_t & b) { return vcge_s8(a, b); }
+inline uint16x4_t vcge(const uint16x4_t & a, const uint16x4_t & b) { return vcge_u16(a, b); }
+inline uint16x4_t vcge(const int16x4_t & a, const int16x4_t & b) { return vcge_s16(a, b); }
+inline uint32x2_t vcge(const uint32x2_t & a, const uint32x2_t & b) { return vcge_u32(a, b); }
+inline uint32x2_t vcge(const int32x2_t & a, const int32x2_t & b) { return vcge_s32(a, b); }
+inline uint32x2_t vcge(const float32x2_t & a, const float32x2_t & b) { return vcge_f32(a, b); }
+
+////////////////////////////// vandq ///////////////////////
+
+// vandq() overloads for 128-bit integer vectors: each one forwards to the vandq_<suffix>
+// intrinsic (bitwise AND) selected by the argument type.
+inline uint8x16_t vandq(const uint8x16_t & a, const uint8x16_t & b) { return vandq_u8(a, b); }
+inline int8x16_t vandq(const int8x16_t & a, const int8x16_t & b) { return vandq_s8(a, b); }
+inline uint16x8_t vandq(const uint16x8_t & a, const uint16x8_t & b) { return vandq_u16(a, b); }
+inline int16x8_t vandq(const int16x8_t & a, const int16x8_t & b) { return vandq_s16(a, b); }
+inline uint32x4_t vandq(const uint32x4_t & a, const uint32x4_t & b) { return vandq_u32(a, b); }
+inline int32x4_t vandq(const int32x4_t & a, const int32x4_t & b) { return vandq_s32(a, b); }
+
+////////////////////////////// vand ///////////////////////
+
+// vand() overloads for 64-bit integer vectors: each one forwards to the vand_<suffix>
+// intrinsic (bitwise AND) selected by the argument type.
+inline uint8x8_t vand(const uint8x8_t & a, const uint8x8_t & b) { return vand_u8(a, b); }
+inline int8x8_t vand(const int8x8_t & a, const int8x8_t & b) { return vand_s8(a, b); }
+inline uint16x4_t vand(const uint16x4_t & a, const uint16x4_t & b) { return vand_u16(a, b); }
+inline int16x4_t vand(const int16x4_t & a, const int16x4_t & b) { return vand_s16(a, b); }
+inline uint32x2_t vand(const uint32x2_t & a, const uint32x2_t & b) { return vand_u32(a, b); }
+inline int32x2_t vand(const int32x2_t & a, const int32x2_t & b) { return vand_s32(a, b); }
+
+////////////////////////////// vmovn ///////////////////////
+
+// vmovn() overloads: narrow a 128-bit vector to a 64-bit vector with lanes half as wide,
+// forwarding to the vmovn_<suffix> intrinsic that matches the argument type.
+inline uint8x8_t vmovn(const uint16x8_t & wide) { return vmovn_u16(wide); }
+inline int8x8_t vmovn(const int16x8_t & wide) { return vmovn_s16(wide); }
+inline uint16x4_t vmovn(const uint32x4_t & wide) { return vmovn_u32(wide); }
+inline int16x4_t vmovn(const int32x4_t & wide) { return vmovn_s32(wide); }
+inline uint32x2_t vmovn(const uint64x2_t & wide) { return vmovn_u64(wide); }
+inline int32x2_t vmovn(const int64x2_t & wide) { return vmovn_s64(wide); }
+
+////////////////////////////// vqmovn ///////////////////////
+
+// vqmovn() overloads: saturating narrow of a 128-bit vector to a 64-bit vector with lanes half
+// as wide, forwarding to the vqmovn_<suffix> intrinsic that matches the argument type.
+inline uint8x8_t vqmovn(const uint16x8_t & wide) { return vqmovn_u16(wide); }
+inline int8x8_t vqmovn(const int16x8_t & wide) { return vqmovn_s16(wide); }
+inline uint16x4_t vqmovn(const uint32x4_t & wide) { return vqmovn_u32(wide); }
+inline int16x4_t vqmovn(const int32x4_t & wide) { return vqmovn_s32(wide); }
+inline uint32x2_t vqmovn(const uint64x2_t & wide) { return vqmovn_u64(wide); }
+inline int32x2_t vqmovn(const int64x2_t & wide) { return vqmovn_s64(wide); }
+
+////////////////////////////// vmovl ///////////////////////
+
+// vmovl() overloads: widen a 64-bit vector to a 128-bit vector with lanes twice as wide,
+// forwarding to the vmovl_<suffix> intrinsic that matches the argument type.
+inline uint16x8_t vmovl(const uint8x8_t & narrow) { return vmovl_u8(narrow); }
+inline int16x8_t vmovl(const int8x8_t & narrow) { return vmovl_s8(narrow); }
+inline uint32x4_t vmovl(const uint16x4_t & narrow) { return vmovl_u16(narrow); }
+inline int32x4_t vmovl(const int16x4_t & narrow) { return vmovl_s16(narrow); }
+
+////////////////////////////// vmvnq ///////////////////////
+
+// vmvnq() overloads for 128-bit integer vectors: each one forwards to the vmvnq_<suffix>
+// intrinsic (bitwise NOT) selected by the argument type.
+inline uint8x16_t vmvnq(const uint8x16_t & src) { return vmvnq_u8(src); }
+inline int8x16_t vmvnq(const int8x16_t & src) { return vmvnq_s8(src); }
+inline uint16x8_t vmvnq(const uint16x8_t & src) { return vmvnq_u16(src); }
+inline int16x8_t vmvnq(const int16x8_t & src) { return vmvnq_s16(src); }
+inline uint32x4_t vmvnq(const uint32x4_t & src) { return vmvnq_u32(src); }
+inline int32x4_t vmvnq(const int32x4_t & src) { return vmvnq_s32(src); }
+
+////////////////////////////// vmvn ///////////////////////
+
+// vmvn() overloads for 64-bit integer vectors: each one forwards to the vmvn_<suffix>
+// intrinsic (bitwise NOT) selected by the argument type.
+inline uint8x8_t vmvn(const uint8x8_t & src) { return vmvn_u8(src); }
+inline int8x8_t vmvn(const int8x8_t & src) { return vmvn_s8(src); }
+inline uint16x4_t vmvn(const uint16x4_t & src) { return vmvn_u16(src); }
+inline int16x4_t vmvn(const int16x4_t & src) { return vmvn_s16(src); }
+inline uint32x2_t vmvn(const uint32x2_t & src) { return vmvn_u32(src); }
+inline int32x2_t vmvn(const int32x2_t & src) { return vmvn_s32(src); }
+
+////////////////////////////// vbicq ///////////////////////
+
+// vbicq() overloads for 128-bit integer vectors: each one forwards to the vbicq_<suffix>
+// intrinsic (bit clear, i.e. a AND NOT b) selected by the argument type.
+inline uint8x16_t vbicq(const uint8x16_t & a, const uint8x16_t & b) { return vbicq_u8(a, b); }
+inline int8x16_t vbicq(const int8x16_t & a, const int8x16_t & b) { return vbicq_s8(a, b); }
+inline uint16x8_t vbicq(const uint16x8_t & a, const uint16x8_t & b) { return vbicq_u16(a, b); }
+inline int16x8_t vbicq(const int16x8_t & a, const int16x8_t & b) { return vbicq_s16(a, b); }
+inline uint32x4_t vbicq(const uint32x4_t & a, const uint32x4_t & b) { return vbicq_u32(a, b); }
+inline int32x4_t vbicq(const int32x4_t & a, const int32x4_t & b) { return vbicq_s32(a, b); }
+inline uint64x2_t vbicq(const uint64x2_t & a, const uint64x2_t & b) { return vbicq_u64(a, b); }
+inline int64x2_t vbicq(const int64x2_t & a, const int64x2_t & b) { return vbicq_s64(a, b); }
+
+////////////////////////////// vbic ///////////////////////
+
+// vbic() overloads for 64-bit integer vectors: each one forwards to the vbic_<suffix>
+// intrinsic (bit clear, i.e. a AND NOT b) selected by the argument type.
+inline uint8x8_t vbic(const uint8x8_t & a, const uint8x8_t & b) { return vbic_u8(a, b); }
+inline int8x8_t vbic(const int8x8_t & a, const int8x8_t & b) { return vbic_s8(a, b); }
+inline uint16x4_t vbic(const uint16x4_t & a, const uint16x4_t & b) { return vbic_u16(a, b); }
+inline int16x4_t vbic(const int16x4_t & a, const int16x4_t & b) { return vbic_s16(a, b); }
+inline uint32x2_t vbic(const uint32x2_t & a, const uint32x2_t & b) { return vbic_u32(a, b); }
+inline int32x2_t vbic(const int32x2_t & a, const int32x2_t & b) { return vbic_s32(a, b); }
+inline uint64x1_t vbic(const uint64x1_t & a, const uint64x1_t & b) { return vbic_u64(a, b); }
+inline int64x1_t vbic(const int64x1_t & a, const int64x1_t & b) { return vbic_s64(a, b); }
+
+////////////////////////////// vtransform ///////////////////////
+
+// Applies the binary functor `op` element by element to two source images and stores the
+// result in the destination image.
+//
+//   size               - image size in elements (taken by value; may be collapsed to one row)
+//   src0Base/src1Base  - first / second source image
+//   dstBase            - destination image
+//   *Stride            - row strides in bytes (they are compared against width * sizeof(element))
+//   op                 - functor exposing a nested `type` (the element type); it is invoked as
+//                        op(a, b, out) with 128-bit vectors, with 64-bit vectors, and with
+//                        element pointers for the scalar remainder
+//
+// Every row is processed in three stages: 32 bytes per iteration (two 128-bit vectors),
+// then 8 bytes per iteration (one 64-bit vector), then one element at a time.
+template <typename Op>
+void vtransform(Size2D size,
+                const typename Op::type * src0Base, ptrdiff_t src0Stride,
+                const typename Op::type * src1Base, ptrdiff_t src1Stride,
+                typename Op::type * dstBase, ptrdiff_t dstStride, const Op & op)
+{
+    typedef typename Op::type elem_t;
+    typedef typename VecTraits<elem_t>::vec128 qvec_t;
+    typedef typename VecTraits<elem_t>::vec64 dvec_t;
+
+    // If all three images share one stride and that stride equals the row payload,
+    // the rows are contiguous and the whole image can be walked as a single long row.
+    const ptrdiff_t denseStride = (ptrdiff_t)(size.width * sizeof(elem_t));
+    if (src0Stride == denseStride && src1Stride == src0Stride && dstStride == src0Stride)
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+
+    const size_t qLanes = 16 / sizeof(elem_t);     // elements in one 128-bit vector
+    const size_t wideStep = 32 / sizeof(elem_t);   // elements consumed per wide iteration
+    const size_t narrowStep = 8 / sizeof(elem_t);  // elements consumed per 64-bit iteration
+
+    // Exclusive upper bounds for the start index of a full wide / narrow iteration.
+    size_t wideEnd = 0;
+    if (size.width >= (wideStep - 1))
+        wideEnd = size.width - wideStep + 1;
+    size_t narrowEnd = 0;
+    if (size.width >= (narrowStep - 1))
+        narrowEnd = size.width - narrowStep + 1;
+
+    for (size_t row = 0; row < size.height; ++row)
+    {
+        const elem_t * lhs = internal::getRowPtr(src0Base, src0Stride, row);
+        const elem_t * rhs = internal::getRowPtr(src1Base, src1Stride, row);
+        typename Op::type * out = internal::getRowPtr(dstBase, dstStride, row);
+        size_t col = 0;
+
+        // Wide stage: two 128-bit vectors from each source per iteration.
+        while (col < wideEnd)
+        {
+            internal::prefetch(lhs + col);
+            internal::prefetch(rhs + col);
+
+            // All four loads are issued before the first store.
+            qvec_t lhsLo = vld1q(lhs + col);
+            qvec_t lhsHi = vld1q(lhs + col + qLanes);
+            qvec_t rhsLo = vld1q(rhs + col);
+            qvec_t rhsHi = vld1q(rhs + col + qLanes);
+            qvec_t result;
+
+            op(lhsLo, rhsLo, result);
+            vst1q(out + col, result);
+
+            op(lhsHi, rhsHi, result);
+            vst1q(out + col + qLanes, result);
+
+            col += wideStep;
+        }
+
+        // Narrow stage: one 64-bit vector from each source per iteration.
+        while (col < narrowEnd)
+        {
+            dvec_t lhsVec = vld1(lhs + col);
+            dvec_t rhsVec = vld1(rhs + col);
+            dvec_t result;
+
+            op(lhsVec, rhsVec, result);
+            vst1(out + col, result);
+
+            col += narrowStep;
+        }
+
+        // Scalar stage: whatever is left of the row.
+        while (col < size.width)
+        {
+            op(lhs + col, rhs + col, out + col);
+            ++col;
+        }
+    }
+}
+
+} }
+
+#endif // CAROTENE_NEON
+
+#endif
--- a/3rdparty/carotene/src/warp_affine.cpp
+++ b/3rdparty/carotene/src/warp_affine.cpp
@ -0,0 +1,434 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "remap.hpp"
+
+namespace CAROTENE_NS {
+
+// Reports whether warpAffineNearestNeighbor can process a source image of size `ssize`.
+// On targets where size_t is wider than 32 bits, sources whose width or height exceeds
+// 0xffffFFFF are rejected; otherwise the answer is isSupportedConfiguration().
+bool isWarpAffineNearestNeighborSupported(const Size2D &ssize)
+{
+#if SIZE_MAX > UINT32_MAX
+    // Internal index evaluation is performed with 32-bit values, so cap the image size.
+    if (ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF)
+        return false;
+#else
+    (void)ssize;
+#endif
+    return isSupportedConfiguration();
+}
+
+// Reports whether warpAffineLinear can process a source image of size `ssize`.
+// On targets where size_t is wider than 32 bits, sources whose width or height exceeds
+// 0xffffFFFF are rejected; otherwise the answer is isSupportedConfiguration().
+bool isWarpAffineLinearSupported(const Size2D &ssize)
+{
+#if SIZE_MAX > UINT32_MAX
+    // Internal index evaluation is performed with 32-bit values, so cap the image size.
+    if (ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF)
+        return false;
+#else
+    (void)ssize;
+#endif
+    return isSupportedConfiguration();
+}
+
+// Nearest-neighbour affine warp of a u8 image.
+//
+// For each destination pixel (x, y) the source position is evaluated as
+//     src_x = m[0] * x + m[2] * y + m[4]
+//     src_y = m[1] * x + m[3] * y + m[5]
+// and turned into the offset src_y * srcStride + src_x relative to srcBase. The destination is
+// processed in tiles of at most BLOCK_SIZE x BLOCK_SIZE pixels: an offset table is filled for
+// the tile and then passed to remapNearestNeighborReplicate / remapNearestNeighborConst.
+//
+//   ssize, dsize        - source and destination image sizes
+//   srcBase, srcStride  - source image; the stride is the row multiplier of the offset
+//   m                   - six affine coefficients, indexed as in the formulas above
+//   dstBase, dstStride  - destination image
+//   borderMode          - BORDER_MODE_REPLICATE or BORDER_MODE_CONSTANT; any other value falls
+//                         through both branches and leaves the destination untouched
+//   borderValue         - forwarded to remapNearestNeighborConst (constant mode only)
+//
+// Without CAROTENE_NEON the body is compiled out and the arguments are only marked as used.
+void warpAffineNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
+                               const u8 * srcBase, ptrdiff_t srcStride,
+                               const f32 * m,
+                               u8 * dstBase, ptrdiff_t dstStride,
+                               BORDER_MODE borderMode, u8 borderValue)
+{
+    internal::assertSupportedConfiguration(isWarpAffineNearestNeighborSupported(ssize));
+#ifdef CAROTENE_NEON
+    using namespace internal;
+
+    // Per-tile table of source offsets. It is over-allocated by 16 entries so that the start
+    // can be adjusted by alignPtr (presumably to a 16-byte boundary - confirm against alignPtr).
+    s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
+    s32 * map = alignPtr(_map, 16);
+
+    // Largest valid source column / row index and the source stride, broadcast to all lanes.
+    int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
+    int32x4_t v_step4 = vdupq_n_s32(srcStride);
+    // x advance per vector iteration (four destination pixels at a time).
+    float32x4_t v_4 = vdupq_n_f32(4.0f);
+
+    float32x4_t v_m0 = vdupq_n_f32(m[0]);
+    float32x4_t v_m1 = vdupq_n_f32(m[1]);
+    float32x4_t v_m2 = vdupq_n_f32(m[2]);
+    float32x4_t v_m3 = vdupq_n_f32(m[3]);
+    float32x4_t v_m4 = vdupq_n_f32(m[4]);
+    float32x4_t v_m5 = vdupq_n_f32(m[5]);
+
+    if (borderMode == BORDER_MODE_REPLICATE)
+    {
+        int32x4_t v_zero4 = vdupq_n_s32(0);
+
+        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
+        {
+            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
+            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
+            {
+                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
+
+                // compute table
+                for (size_t y = 0; y < blockHeight; ++y)
+                {
+                    // Table rows are packed: the row stride is blockWidth entries.
+                    s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
+
+                    size_t x = 0, y_ = y + i;
+                    f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
+                    float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
+                    // Row-constant part of the transform: m[2] * y + m[4] and m[3] * y + m[5].
+                    float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
+
+                    for ( ; x + 4 <= blockWidth; x += 4)
+                    {
+                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
+                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
+
+                        // Convert to s32 and clamp to [0, width - 1] / [0, height - 1].
+                        // NOTE(review): vcvtq_s32_f32 rounds toward zero while the scalar tail
+                        // below uses floorf; after the clamp to 0 both appear to give the same
+                        // index for finite inputs - confirm.
+                        int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf)));
+                        int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf)));
+                        // offset = src_x + src_y * srcStride
+                        int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4);
+                        vst1q_s32(map_row + x, v_src_index);
+
+                        v_x = vaddq_f32(v_x, v_4);
+                    }
+
+                    // Scalar tail for the last (blockWidth % 4) columns of the tile row.
+                    f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
+                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
+                    {
+                        f32 src_x_f = m[0] * x_ + yx;
+                        f32 src_y_f = m[1] * x_ + yy;
+                        s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
+
+                        src_x = std::max(0, std::min<s32>(ssize.width - 1, src_x));
+                        src_y = std::max(0, std::min<s32>(ssize.height - 1, src_y));
+                        map_row[x] = src_y * srcStride + src_x;
+                    }
+                }
+
+                // make remap
+                remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
+                                                        getRowPtr(dstBase, dstStride, i) + j, dstStride);
+            }
+        }
+    }
+    else if (borderMode == BORDER_MODE_CONSTANT)
+    {
+        // -1 is the table entry written for positions that fall outside the source image.
+        int32x4_t v_m1_4 = vdupq_n_s32(-1);
+        float32x4_t v_zero4 = vdupq_n_f32(0.0f);
+
+        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
+        {
+            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
+            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
+            {
+                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
+
+                // compute table
+                for (size_t y = 0; y < blockHeight; ++y)
+                {
+                    // Table rows are packed: the row stride is blockWidth entries.
+                    s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
+
+                    size_t x = 0, y_ = y + i;
+                    f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
+                    float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
+                    // Row-constant part of the transform: m[2] * y + m[4] and m[3] * y + m[5].
+                    float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
+
+                    for ( ; x + 4 <= blockWidth; x += 4)
+                    {
+                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
+                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
+
+                        int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
+                        int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
+                        // In-range mask: the lower bound is tested on the float value (>= 0),
+                        // the upper bound on the converted integer (<= width - 1 / height - 1).
+                        uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)),
+                                                      vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4)));
+                        // Keep the computed offset where the mask is set, -1 elsewhere.
+                        int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4);
+                        vst1q_s32(map_row + x, v_src_index);
+
+                        v_x = vaddq_f32(v_x, v_4);
+                    }
+
+                    // Scalar tail for the last (blockWidth % 4) columns of the tile row.
+                    f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
+                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
+                    {
+                        f32 src_x_f = m[0] * x_ + yx;
+                        f32 src_y_f = m[1] * x_ + yy;
+                        s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
+
+                        map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
+                                     (src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
+                    }
+                }
+
+                // make remap
+                remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
+                                                    getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
+            }
+        }
+    }
+#else
+    (void)ssize;
+    (void)dsize;
+    (void)srcBase;
+    (void)srcStride;
+    (void)m;
+    (void)dstBase;
+    (void)dstStride;
+    (void)borderMode;
+    (void)borderValue;
+#endif
+}
+
+/*
+ * Affine warp of a u8 image (one element per pixel) using linear sampling tables.
+ *
+ * For every destination pixel (x, y) the source position is evaluated as
+ *     src_x = m[0] * x + m[2] * y + m[4]
+ *     src_y = m[1] * x + m[3] * y + m[5]
+ * The destination is processed in tiles of at most BLOCK_SIZE x BLOCK_SIZE pixels.
+ * For each tile two tables are filled and then handed to remapLinearReplicate /
+ * remapLinearConst together with the tile's destination pointer:
+ *   - map:    four s32 offsets per pixel, in the order (x0,y0), (x1,y0), (x0,y1), (x1,y1),
+ *             each computed as y * srcStride + x;
+ *   - coeffs: two f32 values per pixel, the fractional parts of src_x and src_y.
+ *
+ * ssize       size of the source image
+ * dsize       size of the destination image
+ * srcBase     source pixels, rows srcStride apart
+ * m           six transform coefficients, used as shown above
+ * dstBase     destination pixels, rows dstStride apart
+ * borderMode  BORDER_MODE_REPLICATE clamps the neighbour coordinates to the image;
+ *             BORDER_MODE_CONSTANT stores -1 for neighbours outside the image
+ *             (presumably replaced by borderValue inside remapLinearConst - confirm there).
+ *             Any other mode falls through both branches and leaves dstBase untouched.
+ * borderValue forwarded to remapLinearConst only
+ *
+ * Without CAROTENE_NEON the body is empty apart from silencing unused-parameter warnings.
+ */
+void warpAffineLinear(const Size2D &ssize, const Size2D &dsize,
+                      const u8 * srcBase, ptrdiff_t srcStride,
+                      const f32 * m,
+                      u8 * dstBase, ptrdiff_t dstStride,
+                      BORDER_MODE borderMode, u8 borderValue)
+{
+    internal::assertSupportedConfiguration(isWarpAffineLinearSupported(ssize));
+#ifdef CAROTENE_NEON
+    using namespace internal;
+
+    // Scratch tables for one tile: 4 offsets and 2 coefficients per pixel.
+    // The extra 16 elements leave room for alignPtr to move the start forward.
+    s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
+    f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
+    s32 * map = alignPtr(_map, 16);
+    f32 * coeffs = alignPtr(_coeffs, 16);
+
+    // Largest valid source coordinates and the row stride, broadcast to all lanes.
+    int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
+    int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
+    float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);
+
+    // Step of the destination x lanes between vector iterations; loop-invariant,
+    // so it is created once here instead of once per table row.
+    float32x4_t v_4 = vdupq_n_f32(4.0f);
+
+    float32x4_t v_m0 = vdupq_n_f32(m[0]);
+    float32x4_t v_m1 = vdupq_n_f32(m[1]);
+    float32x4_t v_m2 = vdupq_n_f32(m[2]);
+    float32x4_t v_m3 = vdupq_n_f32(m[3]);
+    float32x4_t v_m4 = vdupq_n_f32(m[4]);
+    float32x4_t v_m5 = vdupq_n_f32(m[5]);
+
+    if (borderMode == BORDER_MODE_REPLICATE)
+    {
+        int32x4_t v_zero4 = vdupq_n_s32(0);
+
+        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
+        {
+            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
+            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
+            {
+                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
+
+                // compute table
+                for (size_t y = 0; y < blockHeight; ++y)
+                {
+                    // Table rows are packed tightly for the current tile width.
+                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
+                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
+
+                    size_t x = 0, y_ = y + i;
+                    f32 indices[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
+                    float32x4_t v_x = vld1q_f32(indices), v_y = vdupq_n_f32(y_);
+                    // Row-constant part of the transform: m[2] * y + m[4] and m[3] * y + m[5].
+                    float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
+
+                    // Four destination pixels per iteration.
+                    for ( ; x + 4 <= blockWidth; x += 4)
+                    {
+                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
+                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
+
+                        int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
+                        int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
+
+                        // vcvtq_s32_f32 truncates toward zero; where the remainder is negative,
+                        // add 1 to the fraction and subtract 1 from the integer to get floor().
+                        float32x4x2_t v_coeff;
+                        v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x));
+                        v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y));
+                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
+                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
+                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
+                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
+                        v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
+                        v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);
+
+                        // Clamp both neighbours on each axis to [0, size - 1].
+                        int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
+                        int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
+                        int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
+                        int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));
+
+                        int32x4x4_t v_dst_index;
+                        v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
+                        v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
+                        v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
+                        v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);
+
+                        // Interleaving stores: 2 coefficients / 4 offsets per pixel, pixel after pixel.
+                        vst2q_f32(coeff_row + (x << 1), v_coeff);
+                        vst4q_s32(map_row + (x << 2), v_dst_index);
+
+                        v_x = vaddq_f32(v_x, v_4);
+                    }
+
+                    // Scalar tail for the last (blockWidth % 4) pixels of the row.
+                    f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
+                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
+                    {
+                        f32 src_x_f = m[0] * x_ + yx;
+                        f32 src_y_f = m[1] * x_ + yy;
+
+                        s32 src0_x = (s32)floorf(src_x_f);
+                        s32 src0_y = (s32)floorf(src_y_f);
+
+                        // Fractions are taken before clamping, as in the vector path.
+                        coeff_row[(x << 1) + 0] = src_x_f - src0_x;
+                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;
+
+                        s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
+                        src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
+                        s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
+                        src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));
+
+                        map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
+                        map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
+                        map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
+                        map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
+                    }
+                }
+
+                // make remap
+                remapLinearReplicate(Size2D(blockWidth, blockHeight),
+                                     srcBase, &map[0], &coeffs[0],
+                                     getRowPtr(dstBase, dstStride, i) + j, dstStride);
+            }
+        }
+    }
+    else if (borderMode == BORDER_MODE_CONSTANT)
+    {
+        // Offset stored for neighbours that lie outside the source image.
+        int32x4_t v_m1_4 = vdupq_n_s32(-1);
+
+        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
+        {
+            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
+            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
+            {
+                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
+
+                // compute table
+                for (size_t y = 0; y < blockHeight; ++y)
+                {
+                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
+                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
+
+                    size_t x = 0, y_ = y + i;
+                    f32 indices[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
+                    float32x4_t v_x = vld1q_f32(indices), v_y = vdupq_n_f32(y_);
+                    float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
+
+                    for ( ; x + 4 <= blockWidth; x += 4)
+                    {
+                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
+                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
+
+                        int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf);
+                        int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf);
+
+                        // Same truncation-to-floor correction as in the replicate branch.
+                        float32x4x2_t v_coeff;
+                        v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0));
+                        v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0));
+                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
+                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
+                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
+                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
+                        v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
+                        v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);
+
+                        int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
+                        int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);
+
+                        // Unclamped offsets; out-of-image ones are overwritten with -1 below.
+                        int32x4x4_t v_dst_index;
+                        v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
+                        v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
+                        v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
+                        v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);
+
+                        // Lower bound is tested on the float coordinate, upper bound on the integer one.
+                        uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4f), vcleq_s32(v_src_x0, v_width4));
+                        uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4f), vcleq_s32(v_src_x1, v_width4));
+                        uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4f), vcleq_s32(v_src_y0, v_height4));
+                        uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4f), vcleq_s32(v_src_y1, v_height4));
+
+                        v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
+                        v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
+                        v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
+                        v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);
+
+                        vst2q_f32(coeff_row + (x << 1), v_coeff);
+                        vst4q_s32(map_row + (x << 2), v_dst_index);
+
+                        v_x = vaddq_f32(v_x, v_4);
+                    }
+
+                    // Scalar tail for the last (blockWidth % 4) pixels of the row.
+                    f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
+                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
+                    {
+                        f32 src_x_f = m[0] * x_ + yx;
+                        f32 src_y_f = m[1] * x_ + yy;
+
+                        s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
+                        s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;
+
+                        coeff_row[(x << 1) + 0] = src_x_f - src0_x;
+                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;
+
+                        map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
+                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
+                        map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
+                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
+                        map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
+                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
+                        map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
+                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
+                    }
+                }
+
+                // make remap
+                remapLinearConst(Size2D(blockWidth, blockHeight),
+                                 srcBase, &map[0], &coeffs[0],
+                                 getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
+            }
+        }
+    }
+#else
+    (void)ssize;
+    (void)dsize;
+    (void)srcBase;
+    (void)srcStride;
+    (void)m;
+    (void)dstBase;
+    (void)dstStride;
+    (void)borderMode;
+    (void)borderValue;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/carotene/src/warp_perspective.cpp
+++ b/3rdparty/carotene/src/warp_perspective.cpp
@ -0,0 +1,464 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+
+
+#include "remap.hpp"
+
+namespace CAROTENE_NS {
+
+// Tells whether warpPerspectiveNearestNeighbor() may be called for a source image
+// of size ssize: the dimensions must fit into 32 bits and the generic
+// isSupportedConfiguration() check must pass.
+bool isWarpPerspectiveNearestNeighborSupported(const Size2D &ssize)
+{
+#if SIZE_MAX > UINT32_MAX
+    // size_t is wider than 32 bits on this target, while the internal source index
+    // evaluation is done in 32-bit arithmetic, so larger images are rejected up front.
+    // NOTE(review): the warp routine keeps its offsets in s32 tables, so offsets above
+    // INT32_MAX would still not be representable - confirm whether this limit is tight enough.
+    if (ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF)
+        return false;
+#else
+    (void)ssize;
+#endif
+    return isSupportedConfiguration();
+}
+
+// Tells whether warpPerspectiveLinear() may be called for a source image of size
+// ssize: the dimensions must fit into 32 bits and the generic
+// isSupportedConfiguration() check must pass.
+bool isWarpPerspectiveLinearSupported(const Size2D &ssize)
+{
+#if SIZE_MAX > UINT32_MAX
+    // size_t is wider than 32 bits on this target, while the internal source index
+    // evaluation is done in 32-bit arithmetic, so larger images are rejected up front.
+    // NOTE(review): the warp routine keeps its offsets in s32 tables, so offsets above
+    // INT32_MAX would still not be representable - confirm whether this limit is tight enough.
+    if (ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF)
+        return false;
+#else
+    (void)ssize;
+#endif
+    return isSupportedConfiguration();
+}
+
+/*
+ * Perspective warp of a u8 image (one element per pixel), nearest-neighbour sampling.
+ *
+ * For every destination pixel (x, y):
+ *     w     = m[2] * x + m[5] * y + m[8]
+ *     src_x = (m[0] * x + m[3] * y + m[6]) * (1 / w)
+ *     src_y = (m[1] * x + m[4] * y + m[7]) * (1 / w)
+ * The destination is walked in tiles of at most BLOCK_SIZE x BLOCK_SIZE pixels. For each
+ * tile a table with one s32 source offset (src_y * srcStride + src_x) per pixel is built
+ * and passed to remapNearestNeighborReplicate / remapNearestNeighborConst.
+ *
+ * borderMode: BORDER_MODE_REPLICATE clamps the source coordinates to the image,
+ *             BORDER_MODE_CONSTANT stores -1 for coordinates outside the image
+ *             (presumably mapped to borderValue by remapNearestNeighborConst - confirm there).
+ *             Any other mode leaves dstBase untouched.
+ *
+ * Without CAROTENE_NEON the function only silences unused-parameter warnings.
+ */
+void warpPerspectiveNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
+                                    const u8 * srcBase, ptrdiff_t srcStride,
+                                    const f32 * m,
+                                    u8 * dstBase, ptrdiff_t dstStride,
+                                    BORDER_MODE borderMode, u8 borderValue)
+{
+    internal::assertSupportedConfiguration(isWarpPerspectiveNearestNeighborSupported(ssize));
+#ifdef CAROTENE_NEON
+    using namespace internal;
+
+    // Offset table for one tile; the 16 spare elements give alignPtr room to adjust.
+    s32 tableStorage[BLOCK_SIZE * BLOCK_SIZE + 16];
+    s32 * const table = alignPtr(tableStorage, 16);
+
+    // Broadcast constants shared by both border modes.
+    const int32x4_t vLastCol = vdupq_n_s32(ssize.width - 1);
+    const int32x4_t vLastRow = vdupq_n_s32(ssize.height - 1);
+    const int32x4_t vSrcStride = vdupq_n_s32(srcStride);
+    const float32x4_t vLaneStep = vdupq_n_f32(4.0f);
+
+    const float32x4_t vM0 = vdupq_n_f32(m[0]);
+    const float32x4_t vM1 = vdupq_n_f32(m[1]);
+    const float32x4_t vM2 = vdupq_n_f32(m[2]);
+    const float32x4_t vM3 = vdupq_n_f32(m[3]);
+    const float32x4_t vM4 = vdupq_n_f32(m[4]);
+    const float32x4_t vM5 = vdupq_n_f32(m[5]);
+    const float32x4_t vM6 = vdupq_n_f32(m[6]);
+    const float32x4_t vM7 = vdupq_n_f32(m[7]);
+    const float32x4_t vM8 = vdupq_n_f32(m[8]);
+
+    if (borderMode == BORDER_MODE_REPLICATE)
+    {
+        const int32x4_t vZero = vdupq_n_s32(0);
+        const s32 lastCol = (s32)(ssize.width - 1);
+        const s32 lastRow = (s32)(ssize.height - 1);
+
+        for (size_t top = 0; top < dsize.height; top += BLOCK_SIZE)
+        {
+            const size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - top);
+            for (size_t left = 0; left < dsize.width; left += BLOCK_SIZE)
+            {
+                const size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - left);
+
+                // Fill the offset table, one tile row at a time.
+                for (size_t row = 0; row < blockHeight; ++row)
+                {
+                    s32 * const tableRow = getRowPtr(table, blockWidth * sizeof(s32), row);
+                    const size_t dstY = row + top;
+
+                    // Parts of the three projective sums that depend on y only.
+                    const float32x4_t vDstY = vdupq_n_f32(dstY);
+                    const float32x4_t vRowX = vmlaq_f32(vM6, vM3, vDstY);
+                    const float32x4_t vRowY = vmlaq_f32(vM7, vM4, vDstY);
+                    const float32x4_t vRowW = vmlaq_f32(vM8, vM5, vDstY);
+
+                    const f32 firstLanes[4] = { left + 0.0f, left + 1.0f, left + 2.0f, left + 3.0f };
+                    float32x4_t vDstX = vld1q_f32(firstLanes);
+
+                    // Vector part: four destination pixels per pass.
+                    size_t col = 0;
+                    while (col + 4 <= blockWidth)
+                    {
+                        // vrecpq_f32 is not defined in this function; presumably a reciprocal helper.
+                        const float32x4_t vInvW = vrecpq_f32(vmlaq_f32(vRowW, vM2, vDstX));
+                        const float32x4_t vSrcXf = vmulq_f32(vInvW, vmlaq_f32(vRowX, vM0, vDstX));
+                        const float32x4_t vSrcYf = vmulq_f32(vInvW, vmlaq_f32(vRowY, vM1, vDstX));
+
+                        // Convert to integer, then clamp into [0, size - 1].
+                        int32x4_t vSrcX = vcvtq_s32_f32(vSrcXf);
+                        vSrcX = vminq_s32(vLastCol, vSrcX);
+                        vSrcX = vmaxq_s32(vZero, vSrcX);
+
+                        int32x4_t vSrcY = vcvtq_s32_f32(vSrcYf);
+                        vSrcY = vminq_s32(vLastRow, vSrcY);
+                        vSrcY = vmaxq_s32(vZero, vSrcY);
+
+                        vst1q_s32(tableRow + col, vmlaq_s32(vSrcX, vSrcY, vSrcStride));
+
+                        vDstX = vaddq_f32(vDstX, vLaneStep);
+                        col += 4;
+                    }
+
+                    // Scalar part: remaining pixels of the row.
+                    const f32 rowX = m[3] * dstY + m[6];
+                    const f32 rowY = m[4] * dstY + m[7];
+                    const f32 rowW = m[5] * dstY + m[8];
+                    for ( ; col < blockWidth; ++col)
+                    {
+                        const ptrdiff_t dstX = col + left;
+                        const f32 invW = 1.0f / (m[2] * dstX + rowW);
+                        const f32 srcXf = (m[0] * dstX + rowX) * invW;
+                        const f32 srcYf = (m[1] * dstX + rowY) * invW;
+
+                        s32 srcX = floorf(srcXf);
+                        s32 srcY = floorf(srcYf);
+
+                        // Upper bound first, then lower bound.
+                        if (srcX > lastCol)
+                            srcX = lastCol;
+                        if (srcX < 0)
+                            srcX = 0;
+                        if (srcY > lastRow)
+                            srcY = lastRow;
+                        if (srcY < 0)
+                            srcY = 0;
+
+                        tableRow[col] = srcY * srcStride + srcX;
+                    }
+                }
+
+                // Gather the tile through the table.
+                u8 * const dstTile = getRowPtr(dstBase, dstStride, top) + left;
+                remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, table,
+                                              dstTile, dstStride);
+            }
+        }
+    }
+    else if (borderMode == BORDER_MODE_CONSTANT)
+    {
+        const int32x4_t vOutside = vdupq_n_s32(-1);
+        const float32x4_t vZeroF = vdupq_n_f32(0.0f);
+
+        for (size_t top = 0; top < dsize.height; top += BLOCK_SIZE)
+        {
+            const size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - top);
+            for (size_t left = 0; left < dsize.width; left += BLOCK_SIZE)
+            {
+                const size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - left);
+
+                // Fill the offset table, one tile row at a time.
+                for (size_t row = 0; row < blockHeight; ++row)
+                {
+                    s32 * const tableRow = getRowPtr(table, blockWidth * sizeof(s32), row);
+                    const size_t dstY = row + top;
+
+                    const float32x4_t vDstY = vdupq_n_f32(dstY);
+                    const float32x4_t vRowX = vmlaq_f32(vM6, vM3, vDstY);
+                    const float32x4_t vRowY = vmlaq_f32(vM7, vM4, vDstY);
+                    const float32x4_t vRowW = vmlaq_f32(vM8, vM5, vDstY);
+
+                    const f32 firstLanes[4] = { left + 0.0f, left + 1.0f, left + 2.0f, left + 3.0f };
+                    float32x4_t vDstX = vld1q_f32(firstLanes);
+
+                    size_t col = 0;
+                    while (col + 4 <= blockWidth)
+                    {
+                        const float32x4_t vInvW = vrecpq_f32(vmlaq_f32(vRowW, vM2, vDstX));
+                        const float32x4_t vSrcXf = vmulq_f32(vInvW, vmlaq_f32(vRowX, vM0, vDstX));
+                        const float32x4_t vSrcYf = vmulq_f32(vInvW, vmlaq_f32(vRowY, vM1, vDstX));
+
+                        const int32x4_t vSrcX = vcvtq_s32_f32(vSrcXf);
+                        const int32x4_t vSrcY = vcvtq_s32_f32(vSrcYf);
+
+                        // Inside test: float coordinate >= 0 and integer coordinate <= size - 1.
+                        const uint32x4_t vInsideX = vandq_u32(vcgeq_f32(vSrcXf, vZeroF), vcleq_s32(vSrcX, vLastCol));
+                        const uint32x4_t vInsideY = vandq_u32(vcgeq_f32(vSrcYf, vZeroF), vcleq_s32(vSrcY, vLastRow));
+                        const uint32x4_t vInside = vandq_u32(vInsideX, vInsideY);
+
+                        const int32x4_t vOffset = vmlaq_s32(vSrcX, vSrcY, vSrcStride);
+                        vst1q_s32(tableRow + col, vbslq_s32(vInside, vOffset, vOutside));
+
+                        vDstX = vaddq_f32(vDstX, vLaneStep);
+                        col += 4;
+                    }
+
+                    // Scalar part: remaining pixels of the row.
+                    const f32 rowX = m[3] * dstY + m[6];
+                    const f32 rowY = m[4] * dstY + m[7];
+                    const f32 rowW = m[5] * dstY + m[8];
+                    for ( ; col < blockWidth; ++col)
+                    {
+                        const ptrdiff_t dstX = col + left;
+                        const f32 invW = 1.0f / (m[2] * dstX + rowW);
+                        const f32 srcXf = (m[0] * dstX + rowX) * invW;
+                        const f32 srcYf = (m[1] * dstX + rowY) * invW;
+
+                        const s32 srcX = floorf(srcXf);
+                        const s32 srcY = floorf(srcYf);
+
+                        if ((srcX >= 0) && (srcX < (s32)ssize.width) &&
+                            (srcY >= 0) && (srcY < (s32)ssize.height))
+                            tableRow[col] = srcY * srcStride + srcX;
+                        else
+                            tableRow[col] = -1;
+                    }
+                }
+
+                // Gather the tile through the table, passing the border value along.
+                u8 * const dstTile = getRowPtr(dstBase, dstStride, top) + left;
+                remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, table,
+                                          dstTile, dstStride, borderValue);
+            }
+        }
+    }
+#else
+    (void)ssize;
+    (void)dsize;
+    (void)srcBase;
+    (void)srcStride;
+    (void)m;
+    (void)dstBase;
+    (void)dstStride;
+    (void)borderMode;
+    (void)borderValue;
+#endif
+}
+
+void warpPerspectiveLinear(const Size2D &ssize, const Size2D &dsize,
+                           const u8 * srcBase, ptrdiff_t srcStride,
+                           const f32 * m,
+                           u8 * dstBase, ptrdiff_t dstStride,
+                           BORDER_MODE borderMode, u8 borderValue)
+{
+    internal::assertSupportedConfiguration(isWarpPerspectiveLinearSupported(ssize));
+#ifdef CAROTENE_NEON
+    using namespace internal;
+
+    s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
+    f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
+    s32 * map = alignPtr(_map, 16);
+    f32 * coeffs = alignPtr(_coeffs, 16);
+
+    int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
+    int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
+    float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);
+
+    float32x4_t v_4 = vdupq_n_f32(4.0f);
+
+    float32x4_t v_m0 = vdupq_n_f32(m[0]);
+    float32x4_t v_m1 = vdupq_n_f32(m[1]);
+    float32x4_t v_m2 = vdupq_n_f32(m[2]);
+    float32x4_t v_m3 = vdupq_n_f32(m[3]);
+    float32x4_t v_m4 = vdupq_n_f32(m[4]);
+    float32x4_t v_m5 = vdupq_n_f32(m[5]);
+    float32x4_t v_m6 = vdupq_n_f32(m[6]);
+    float32x4_t v_m7 = vdupq_n_f32(m[7]);
+    float32x4_t v_m8 = vdupq_n_f32(m[8]);
+
+    if (borderMode == BORDER_MODE_REPLICATE)
+    {
+        int32x4_t v_zero4 = vdupq_n_s32(0);
+
+        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
+        {
+            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
+            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
+            {
+                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
+
+                // compute table
+                for (size_t y = 0; y < blockHeight; ++y)
+                {
+                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
+                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
+
+                    size_t x = 0, y_ = y + i;
+                    f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
+                    float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
+                    float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
+                        v_yw = vmlaq_f32(v_m8, v_m5, v_y);
+
+                    for ( ; x + 4 <= blockWidth; x += 4)
+                    {
+                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
+                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
+                        float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
+                        v_src_xf = vmulq_f32(v_wf, v_src_xf);
+                        v_src_yf = vmulq_f32(v_wf, v_src_yf);
+
+                        int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
+                        int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
+
+                        float32x4x2_t v_coeff;
+                        v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x));
+                        v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y));
+                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
+                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
+                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
+                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
+                        v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
+                        v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);
+
+                        int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
+                        int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
+                        int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
+                        int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));
+
+                        int32x4x4_t v_dst_index;
+                        v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
+                        v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
+                        v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
+                        v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);
+
+                        vst2q_f32(coeff_row + (x << 1), v_coeff);
+                        vst4q_s32(map_row + (x << 2), v_dst_index);
+
+                        v_x = vaddq_f32(v_x, v_4);
+                    }
+
+                    f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
+                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
+                    {
+                        f32 w_f = 1.0f / (m[2] * x_ + yw);
+                        f32 src_x_f = (m[0] * x_ + yx) * w_f;
+                        f32 src_y_f = (m[1] * x_ + yy) * w_f;
+
+                        s32 src0_x = (s32)floorf(src_x_f);
+                        s32 src0_y = (s32)floorf(src_y_f);
+
+                        coeff_row[(x << 1) + 0] = src_x_f - src0_x;
+                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;
+
+                        s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
+                        src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
+                        s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
+                        src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));
+
+                        map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
+                        map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
+                        map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
+                        map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
+                    }
+                }
+
+                remapLinearReplicate(Size2D(blockWidth, blockHeight),
+                                     srcBase, &map[0], &coeffs[0],
+                                     getRowPtr(dstBase, dstStride, i) + j, dstStride);
+            }
+        }
+    }
+    else if (borderMode == BORDER_MODE_CONSTANT)
+    {
+        float32x4_t v_zero4 = vdupq_n_f32(0.0f);
+        int32x4_t v_m1_4 = vdupq_n_s32(-1);
+
+        for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
+        {
+            size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
+            for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
+            {
+                size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
+
+                // compute table
+                for (size_t y = 0; y < blockHeight; ++y)
+                {
+                    s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
+                    f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
+
+                    size_t x = 0, y_ = y + i;
+                    f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
+                    float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
+                    float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
+                        v_yw = vmlaq_f32(v_m8, v_m5, v_y);
+
+                    for ( ; x + 4 <= blockWidth; x += 4)
+                    {
+                        float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
+                        float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
+                        float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
+                        v_src_xf = vmulq_f32(v_wf, v_src_xf);
+                        v_src_yf = vmulq_f32(v_wf, v_src_yf);
+
+                        int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf);
+                        int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf);
+
+                        float32x4x2_t v_coeff;
+                        v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0));
+                        v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0));
+                        uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
+                        uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
+                        v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
+                        v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
+                        v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
+                        v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);
+
+                        int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
+                        int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);
+
+                        int32x4x4_t v_dst_index;
+                        v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
+                        v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
+                        v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
+                        v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);
+
+                        uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4));
+                        uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
+                        uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4));
+                        uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));
+
+                        v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
+                        v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
+                        v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
+                        v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);
+
+                        vst2q_f32(coeff_row + (x << 1), v_coeff);
+                        vst4q_s32(map_row + (x << 2), v_dst_index);
+
+                        v_x = vaddq_f32(v_x, v_4);
+                    }
+
+                    f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
+                    for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
+                    {
+                        f32 w_f = 1.0f / (m[2] * x_ + yw);
+                        f32 src_x_f = (m[0] * x_ + yx) * w_f;
+                        f32 src_y_f = (m[1] * x_ + yy) * w_f;
+
+                        s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
+                        s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;
+
+                        coeff_row[(x << 1) + 0] = src_x_f - src0_x;
+                        coeff_row[(x << 1) + 1] = src_y_f - src0_y;
+
+                        map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
+                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
+                        map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
+                                                (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
+                        map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
+                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
+                        map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
+                                                (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
+                    }
+                }
+
+                remapLinearConst(Size2D(blockWidth, blockHeight),
+                                 srcBase, &map[0], &coeffs[0],
+                                 getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
+            }
+        }
+    }
+#else
+    (void)ssize;
+    (void)dsize;
+    (void)srcBase;
+    (void)srcStride;
+    (void)m;
+    (void)dstBase;
+    (void)dstStride;
+    (void)borderMode;
+    (void)borderValue;
+#endif
+}
+
+} // namespace CAROTENE_NS
--- a/3rdparty/tbb/CMakeLists.txt
+++ b/3rdparty/tbb/CMakeLists.txt
@ -5,9 +5,9 @@ if (WIN32 AND NOT ARM)
  message(FATAL_ERROR "BUILD_TBB option supports Windows on ARM only!\nUse regular official TBB build instead of the BUILD_TBB option!")
 endif()

-set(tbb_ver "tbb43_20141204oss")
-set(tbb_url "http://www.threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb43_20141204oss_src.tgz")
-set(tbb_md5 "e903dd92d9433701f097fa7ca29a3c1f")
+set(tbb_ver "tbb44_20160128oss")
+set(tbb_url "http://www.threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb44_20160128oss_src_0.tgz")
+set(tbb_md5 "9d8a4cdf43496f1b3f7c473a5248e5cc")
 set(tbb_version_file "version_string.ver")
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4702)
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
--- a/3rdparty/zlib/CMakeLists.txt
+++ b/3rdparty/zlib/CMakeLists.txt
@ -82,7 +82,7 @@ if(UNIX)
  endif()
 endif()

-ocv_warnings_disable(CMAKE_C_FLAGS -Wshorten-64-to-32 -Wattributes -Wstrict-prototypes -Wmissing-prototypes -Wmissing-declarations)
+ocv_warnings_disable(CMAKE_C_FLAGS -Wshorten-64-to-32 -Wattributes -Wstrict-prototypes -Wmissing-prototypes -Wmissing-declarations -Wshift-negative-value)

 set_target_properties(${ZLIB_LIBRARY} PROPERTIES
        OUTPUT_NAME ${ZLIB_LIBRARY}
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -68,6 +68,10 @@ if(WINRT)
  endif()
 endif()

+if(POLICY CMP0020)
+  cmake_policy(SET CMP0020 OLD)
+endif()
+
 if(POLICY CMP0022)
  cmake_policy(SET CMP0022 OLD)
 endif()
@ -77,13 +81,14 @@ if(POLICY CMP0026)
  cmake_policy(SET CMP0026 OLD)
 endif()

-if (POLICY CMP0042)
-  # silence cmake 3.0+ warnings about MACOSX_RPATH
-  cmake_policy(SET CMP0042 OLD)
+if(POLICY CMP0042)
+  cmake_policy(SET CMP0042 NEW)
 endif()

+include(cmake/OpenCVUtils.cmake)
+
 # must go before the project command
-set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "Configs" FORCE)
+ocv_update(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "Configs" FORCE)
 if(DEFINED CMAKE_BUILD_TYPE)
  set_property( CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYPES} )
 endif()
@ -96,7 +101,7 @@ if(MSVC)
  set(CMAKE_USE_RELATIVE_PATHS ON CACHE INTERNAL "" FORCE)
 endif()

-include(cmake/OpenCVUtils.cmake)
+ocv_cmake_eval(DEBUG_PRE ONCE)

 ocv_clear_vars(OpenCVModules_TARGETS)

@ -164,6 +169,7 @@ endif()
 OCV_OPTION(WITH_1394           "Include IEEE1394 support"                    ON   IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
 OCV_OPTION(WITH_AVFOUNDATION   "Use AVFoundation for Video I/O"              ON   IF IOS)
 OCV_OPTION(WITH_CARBON         "Use Carbon for UI instead of Cocoa"          OFF  IF APPLE )
+OCV_OPTION(WITH_CAROTENE       "Use NVidia carotene acceleration library for ARM platform"                   ON  IF (ARM OR AARCH64) AND NOT IOS AND NOT (CMAKE_VERSION VERSION_LESS "2.8.11"))
 OCV_OPTION(WITH_VTK            "Include VTK library support (and build opencv_viz module eiher)"             ON  IF (NOT ANDROID AND NOT IOS AND NOT WINRT AND NOT CMAKE_CROSSCOMPILING) )
 OCV_OPTION(WITH_CUDA           "Include NVidia Cuda Runtime support"                                         ON  IF (NOT IOS AND NOT WINRT) )
 OCV_OPTION(WITH_CUFFT          "Include NVidia Cuda Fast Fourier Transform (FFT) library support"            ON  IF (NOT IOS AND NOT WINRT) )
@ -185,8 +191,8 @@ OCV_OPTION(WITH_OPENGL         "Include OpenGL support"                      OFF
 OCV_OPTION(WITH_OPENNI         "Include OpenNI support"                      OFF  IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
 OCV_OPTION(WITH_OPENNI2        "Include OpenNI2 support"                     OFF  IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
 OCV_OPTION(WITH_PNG            "Include PNG support"                         ON)
-OCV_OPTION(WITH_PVAPI          "Include Prosilica GigE support"              ON   IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
-OCV_OPTION(WITH_GIGEAPI        "Include Smartek GigE support"                ON   IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
+OCV_OPTION(WITH_PVAPI          "Include Prosilica GigE support"              OFF   IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
+OCV_OPTION(WITH_GIGEAPI        "Include Smartek GigE support"                OFF   IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
 OCV_OPTION(WITH_QT             "Build with Qt Backend support"               OFF  IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
 OCV_OPTION(WITH_WIN32UI        "Build with Win32 UI Backend support"         ON   IF WIN32 AND NOT WINRT)
 OCV_OPTION(WITH_QUICKTIME      "Use QuickTime for Video I/O insted of QTKit" OFF  IF APPLE )
@ -215,6 +221,7 @@ OCV_OPTION(WITH_VA             "Include VA support"                          OFF
 OCV_OPTION(WITH_VA_INTEL       "Include Intel VA-API/OpenCL support"         OFF  IF (UNIX AND NOT ANDROID) )
 OCV_OPTION(WITH_GDAL           "Include GDAL Support"                        OFF  IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
 OCV_OPTION(WITH_GPHOTO2        "Include gPhoto2 library support"             ON   IF (UNIX AND NOT ANDROID) )
+OCV_OPTION(WITH_LAPACK         "Include Lapack library support"              ON   IF (UNIX AND NOT ANDROID) )

 # OpenCV build components
 # ===================================================
@ -253,7 +260,7 @@ OCV_OPTION(INSTALL_TESTS            "Install accuracy and performance test binar

 # OpenCV build options
 # ===================================================
-OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers"                                  ON   IF (NOT IOS) )
+OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers"                                  ON IF (NOT IOS AND NOT CMAKE_CROSSCOMPILING) )
 OCV_OPTION(ENABLE_SOLUTION_FOLDERS    "Solution folder in Visual Studio or in other IDEs"        (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) )
 OCV_OPTION(ENABLE_PROFILING           "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF  IF CMAKE_COMPILER_IS_GNUCXX )
 OCV_OPTION(ENABLE_COVERAGE            "Enable coverage collection with  GCov"                    OFF  IF CMAKE_COMPILER_IS_GNUCXX )
@ -297,50 +304,50 @@ include(cmake/OpenCVVersion.cmake)
 # ----------------------------------------------------------------------------

 # Save libs and executables in the same place
-set(EXECUTABLE_OUTPUT_PATH "${CMAKE_BINARY_DIR}/bin" CACHE PATH "Output directory for applications" )
+set(EXECUTABLE_OUTPUT_PATH "${CMAKE_BINARY_DIR}/bin" CACHE PATH "Output directory for applications")

-if (ANDROID)
-  if (ANDROID_ABI MATCHES "NEON")
+if(ANDROID)
+  if(ANDROID_ABI MATCHES "NEON")
    set(ENABLE_NEON ON)
  endif()
-  if (ANDROID_ABI MATCHES "VFPV3")
+  if(ANDROID_ABI MATCHES "VFPV3")
    set(ENABLE_VFPV3 ON)
  endif()
 endif()

 if(ANDROID OR WIN32)
-  set(OPENCV_DOC_INSTALL_PATH doc)
+  ocv_update(OPENCV_DOC_INSTALL_PATH doc)
 else()
-  set(OPENCV_DOC_INSTALL_PATH share/OpenCV/doc)
+  ocv_update(OPENCV_DOC_INSTALL_PATH share/OpenCV/doc)
 endif()

 if(WIN32 AND CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
  if(DEFINED OpenCV_RUNTIME AND DEFINED OpenCV_ARCH)
-    set(OpenCV_INSTALL_BINARIES_PREFIX "${OpenCV_ARCH}/${OpenCV_RUNTIME}/")
+    ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "${OpenCV_ARCH}/${OpenCV_RUNTIME}/")
  else()
    message(STATUS "Can't detect runtime and/or arch")
-    set(OpenCV_INSTALL_BINARIES_PREFIX "")
+    ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "")
  endif()
 elseif(ANDROID)
-  set(OpenCV_INSTALL_BINARIES_PREFIX "sdk/native/")
+  ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "sdk/native/")
 else()
-  set(OpenCV_INSTALL_BINARIES_PREFIX "")
+  ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "")
 endif()

 if(ANDROID)
-  set(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples/${ANDROID_NDK_ABI_NAME}")
+  ocv_update(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples/${ANDROID_NDK_ABI_NAME}")
 else()
-  set(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples")
+  ocv_update(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples")
 endif()

 if(ANDROID)
-  set(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin/${ANDROID_NDK_ABI_NAME}")
+  ocv_update(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin/${ANDROID_NDK_ABI_NAME}")
 else()
-  set(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin")
+  ocv_update(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin")
 endif()

 if(NOT OPENCV_TEST_INSTALL_PATH)
-  set(OPENCV_TEST_INSTALL_PATH "${OPENCV_BIN_INSTALL_PATH}")
+  ocv_update(OPENCV_TEST_INSTALL_PATH "${OPENCV_BIN_INSTALL_PATH}")
 endif()

 if (OPENCV_TEST_DATA_PATH)
@ -349,66 +356,74 @@ endif()

 if(OPENCV_TEST_DATA_PATH AND NOT OPENCV_TEST_DATA_INSTALL_PATH)
  if(ANDROID)
-    set(OPENCV_TEST_DATA_INSTALL_PATH "sdk/etc/testdata")
+    ocv_update(OPENCV_TEST_DATA_INSTALL_PATH "sdk/etc/testdata")
  elseif(WIN32)
-    set(OPENCV_TEST_DATA_INSTALL_PATH "testdata")
+    ocv_update(OPENCV_TEST_DATA_INSTALL_PATH "testdata")
  else()
-    set(OPENCV_TEST_DATA_INSTALL_PATH "share/OpenCV/testdata")
+    ocv_update(OPENCV_TEST_DATA_INSTALL_PATH "share/OpenCV/testdata")
  endif()
 endif()

 if(ANDROID)
-  set(LIBRARY_OUTPUT_PATH         "${OpenCV_BINARY_DIR}/lib/${ANDROID_NDK_ABI_NAME}")
-  set(3P_LIBRARY_OUTPUT_PATH      "${OpenCV_BINARY_DIR}/3rdparty/lib/${ANDROID_NDK_ABI_NAME}")
-  set(OPENCV_LIB_INSTALL_PATH     sdk/native/libs/${ANDROID_NDK_ABI_NAME})
-  set(OPENCV_3P_LIB_INSTALL_PATH  sdk/native/3rdparty/libs/${ANDROID_NDK_ABI_NAME})
-  set(OPENCV_CONFIG_INSTALL_PATH  sdk/native/jni)
-  set(OPENCV_INCLUDE_INSTALL_PATH sdk/native/jni/include)
-  set(OPENCV_SAMPLES_SRC_INSTALL_PATH samples/native)
-  set(OPENCV_OTHER_INSTALL_PATH   sdk/etc)
+  set(LIBRARY_OUTPUT_PATH                "${OpenCV_BINARY_DIR}/lib/${ANDROID_NDK_ABI_NAME}")
+  ocv_update(3P_LIBRARY_OUTPUT_PATH      "${OpenCV_BINARY_DIR}/3rdparty/lib/${ANDROID_NDK_ABI_NAME}")
+  ocv_update(OPENCV_LIB_INSTALL_PATH     sdk/native/libs/${ANDROID_NDK_ABI_NAME})
+  ocv_update(OPENCV_3P_LIB_INSTALL_PATH  sdk/native/3rdparty/libs/${ANDROID_NDK_ABI_NAME})
+  ocv_update(OPENCV_CONFIG_INSTALL_PATH  sdk/native/jni)
+  ocv_update(OPENCV_INCLUDE_INSTALL_PATH sdk/native/jni/include)
+  ocv_update(OPENCV_SAMPLES_SRC_INSTALL_PATH samples/native)
+  ocv_update(OPENCV_OTHER_INSTALL_PATH   sdk/etc)
 else()
-  set(LIBRARY_OUTPUT_PATH         "${OpenCV_BINARY_DIR}/lib")
-  set(3P_LIBRARY_OUTPUT_PATH      "${OpenCV_BINARY_DIR}/3rdparty/lib${LIB_SUFFIX}")
+  set(LIBRARY_OUTPUT_PATH                "${OpenCV_BINARY_DIR}/lib")
+  ocv_update(3P_LIBRARY_OUTPUT_PATH      "${OpenCV_BINARY_DIR}/3rdparty/lib${LIB_SUFFIX}")

  if(WIN32 AND CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
    if(OpenCV_STATIC)
-      set(OPENCV_LIB_INSTALL_PATH   "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}")
+      ocv_update(OPENCV_LIB_INSTALL_PATH   "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}")
    else()
-      set(OPENCV_LIB_INSTALL_PATH   "${OpenCV_INSTALL_BINARIES_PREFIX}lib${LIB_SUFFIX}")
+      ocv_update(OPENCV_LIB_INSTALL_PATH   "${OpenCV_INSTALL_BINARIES_PREFIX}lib${LIB_SUFFIX}")
    endif()
-    set(OPENCV_3P_LIB_INSTALL_PATH  "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}")
-    set(OPENCV_SAMPLES_SRC_INSTALL_PATH    samples/native)
-    set(OPENCV_JAR_INSTALL_PATH java)
-    set(OPENCV_OTHER_INSTALL_PATH   etc)
+    ocv_update(OPENCV_3P_LIB_INSTALL_PATH  "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}")
+    ocv_update(OPENCV_SAMPLES_SRC_INSTALL_PATH    samples/native)
+    ocv_update(OPENCV_JAR_INSTALL_PATH java)
+    ocv_update(OPENCV_OTHER_INSTALL_PATH   etc)
+    ocv_update(OPENCV_CONFIG_INSTALL_PATH  ".")
  else()
-    set(OPENCV_LIB_INSTALL_PATH     lib${LIB_SUFFIX})
-    set(OPENCV_3P_LIB_INSTALL_PATH  share/OpenCV/3rdparty/${OPENCV_LIB_INSTALL_PATH})
-    set(OPENCV_SAMPLES_SRC_INSTALL_PATH    share/OpenCV/samples)
-    set(OPENCV_JAR_INSTALL_PATH share/OpenCV/java)
-    set(OPENCV_OTHER_INSTALL_PATH   share/OpenCV)
-  endif()
-  set(OPENCV_INCLUDE_INSTALL_PATH "include")
+    ocv_update(OPENCV_LIB_INSTALL_PATH     lib${LIB_SUFFIX})
+    ocv_update(OPENCV_3P_LIB_INSTALL_PATH  share/OpenCV/3rdparty/${OPENCV_LIB_INSTALL_PATH})
+    ocv_update(OPENCV_SAMPLES_SRC_INSTALL_PATH    share/OpenCV/samples)
+    ocv_update(OPENCV_JAR_INSTALL_PATH share/OpenCV/java)
+    ocv_update(OPENCV_OTHER_INSTALL_PATH   share/OpenCV)

-  math(EXPR SIZEOF_VOID_P_BITS "8 * ${CMAKE_SIZEOF_VOID_P}")
-  if(LIB_SUFFIX AND NOT SIZEOF_VOID_P_BITS EQUAL LIB_SUFFIX)
-    set(OPENCV_CONFIG_INSTALL_PATH lib${LIB_SUFFIX}/cmake/opencv)
-  else()
-    set(OPENCV_CONFIG_INSTALL_PATH share/OpenCV)
+    if(NOT DEFINED OPENCV_CONFIG_INSTALL_PATH)
+      math(EXPR SIZEOF_VOID_P_BITS "8 * ${CMAKE_SIZEOF_VOID_P}")
+      if(LIB_SUFFIX AND NOT SIZEOF_VOID_P_BITS EQUAL LIB_SUFFIX)
+        ocv_update(OPENCV_CONFIG_INSTALL_PATH lib${LIB_SUFFIX}/cmake/opencv)
+      else()
+        ocv_update(OPENCV_CONFIG_INSTALL_PATH share/OpenCV)
+      endif()
+    endif()
  endif()
+  ocv_update(OPENCV_INCLUDE_INSTALL_PATH "include")
 endif()

-set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${OPENCV_LIB_INSTALL_PATH}")
+ocv_update(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${OPENCV_LIB_INSTALL_PATH}")
 set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)

 if(INSTALL_TO_MANGLED_PATHS)
  set(OPENCV_INCLUDE_INSTALL_PATH ${OPENCV_INCLUDE_INSTALL_PATH}/opencv-${OPENCV_VERSION})
-  string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_3P_LIB_INSTALL_PATH "${OPENCV_3P_LIB_INSTALL_PATH}")
-  string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_SAMPLES_SRC_INSTALL_PATH "${OPENCV_SAMPLES_SRC_INSTALL_PATH}")
-  string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_CONFIG_INSTALL_PATH "${OPENCV_CONFIG_INSTALL_PATH}")
-  string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_DOC_INSTALL_PATH "${OPENCV_DOC_INSTALL_PATH}")
-  string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_JAR_INSTALL_PATH "${OPENCV_JAR_INSTALL_PATH}")
-  string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_TEST_DATA_INSTALL_PATH "${OPENCV_TEST_DATA_INSTALL_PATH}")
-  string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_OTHER_INSTALL_PATH "${OPENCV_OTHER_INSTALL_PATH}")
+  foreach(v  # v holds the NAME of each install-path variable; it is rewritten in place below
+      OPENCV_3P_LIB_INSTALL_PATH
+      OPENCV_SAMPLES_SRC_INSTALL_PATH
+      OPENCV_CONFIG_INSTALL_PATH
+      OPENCV_DOC_INSTALL_PATH
+      OPENCV_JAR_INSTALL_PATH
+      OPENCV_TEST_DATA_INSTALL_PATH
+      OPENCV_OTHER_INSTALL_PATH
+    )
+    string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" ${v} "${${v}}")  # version-suffix every "OpenCV" path component
+    string(REPLACE "opencv" "opencv-${OPENCV_VERSION}" ${v} "${${v}}")  # same for lowercase "opencv" (e.g. lib${LIB_SUFFIX}/cmake/opencv)
+  endforeach()
 endif()


@ -433,7 +448,7 @@ endif()
 # ----------------------------------------------------------------------------
 #  Path for build/platform -specific headers
 # ----------------------------------------------------------------------------
-set(OPENCV_CONFIG_FILE_INCLUDE_DIR "${CMAKE_BINARY_DIR}/" CACHE PATH "Where to create the platform-dependant cvconfig.h")
+ocv_update(OPENCV_CONFIG_FILE_INCLUDE_DIR "${CMAKE_BINARY_DIR}/" CACHE PATH "Where to create the platform-dependant cvconfig.h")
 ocv_include_directories(${OPENCV_CONFIG_FILE_INCLUDE_DIR})

 # ----------------------------------------------------------------------------
@ -446,7 +461,7 @@ set(OPENCV_EXTRA_MODULES_PATH "" CACHE PATH "Where to look for additional OpenCV
 # ----------------------------------------------------------------------------
 find_host_package(Git QUIET)

-if(GIT_FOUND)
+if(NOT DEFINED OPENCV_VCSVERSION AND GIT_FOUND)
  execute_process(COMMAND "${GIT_EXECUTABLE}" describe --tags --always --dirty --match "[0-9].[0-9].[0-9]*"
    WORKING_DIRECTORY "${OpenCV_SOURCE_DIR}"
    OUTPUT_VARIABLE OPENCV_VCSVERSION
@ -457,7 +472,7 @@ if(GIT_FOUND)
  if(NOT GIT_RESULT EQUAL 0)
    set(OPENCV_VCSVERSION "unknown")
  endif()
-else()
+elseif(NOT DEFINED OPENCV_VCSVERSION)
  # We don't have git:
  set(OPENCV_VCSVERSION "unknown")
 endif()
@ -596,30 +611,56 @@ endif()

 include(cmake/OpenCVDetectVTK.cmake)

-# -- Custom HAL replacement --
-set(_includes "")
-# assuming OPENCV_HAL_HEADERS and OPENCV_HAL_LIBS are lists of files:
-# option example: -DOPENCV_HAL_HEADERS="<some-path>/header1.h;<some-path>/header2.h"
-if (OPENCV_HAL_HEADERS AND OPENCV_HAL_LIBS)
-  foreach (h ${OPENCV_HAL_HEADERS})
-    get_filename_component(h "${h}" ABSOLUTE)
-    set(_includes "${_includes}\n#include \"${h}\"")
+
+# ----------------------------------------------------------------------------
+# OpenCV HAL
+# ----------------------------------------------------------------------------
+set(_hal_includes "")
+macro(ocv_hal_register HAL_LIBRARIES_VAR HAL_HEADERS_VAR HAL_INCLUDE_DIRS_VAR)  # registers one HAL; each argument is the NAME of a list variable (read via ${${...}})
+  # 1. libraries: entries that are not CMake targets are converted to absolute paths, then all are appended to OPENCV_HAL_LINKER_LIBS
+  foreach (l ${${HAL_LIBRARIES_VAR}})
+    if(NOT TARGET ${l})
+      get_filename_component(l "${l}" ABSOLUTE)
+    endif()
+    list(APPEND OPENCV_HAL_LINKER_LIBS ${l})
+  endforeach()
-  foreach (l ${OPENCV_HAL_LIBS})
-    get_filename_component(l "${l}" ABSOLUTE)
-    set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${l})
-    # TODO: install?
-    # ocv_install_target(${l} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
+  # 2. headers: each header adds one '#include "<header>"' line to the text accumulated in _hal_includes
+  foreach (h ${${HAL_HEADERS_VAR}})
+    set(_hal_includes "${_hal_includes}\n#include \"${h}\"")
  endforeach()
-else()
-  set(_includes "// using default HAL")
-  unset(OPENCV_HAL_HEADERS CACHE)
-  unset(OPENCV_HAL_LIBS CACHE)
+  # 3. include paths: the directories listed in the named variable are passed to ocv_include_directories()
+  ocv_include_directories(${${HAL_INCLUDE_DIRS_VAR}})
+endmacro()
+
+if(NOT DEFINED OpenCV_HAL)  # OpenCV_HAL is a list of HAL names; a value defined earlier (e.g. by the user) is kept
+  set(OpenCV_HAL "OpenCV_HAL")  # default: a single package literally named "OpenCV_HAL"
 endif()
-set(OPENCV_HAL_HEADERS "${OPENCV_HAL_HEADERS}" CACHE STRING "Headers with custom HAL implementation")
-set(OPENCV_HAL_LIBS "${OPENCV_HAL_LIBS}" CACHE STRING "Libraries with custom HAL implementation")
+
+if(WITH_CAROTENE)
+  ocv_debug_message(STATUS "Enable carotene acceleration")
+  if(NOT ";${OpenCV_HAL};" MATCHES ";carotene;")  # semicolon padding makes this match a whole list element only
+    set(OpenCV_HAL "carotene;${OpenCV_HAL}")  # carotene is put first in the list
+  endif()
+endif()
+
+foreach(hal ${OpenCV_HAL})  # HALs are registered in list order
+  if(hal STREQUAL "carotene")
+    add_subdirectory(3rdparty/carotene/hal)  # carotene is built from the bundled sources
+    ocv_hal_register(CAROTENE_HAL_LIBRARIES CAROTENE_HAL_HEADERS CAROTENE_HAL_INCLUDE_DIRS)  # NOTE(review): CAROTENE_HAL_* are presumably exported by the subdirectory above - confirm
+    list(APPEND OpenCV_USED_HAL "carotene (ver ${CAROTENE_HAL_VERSION})")
+  else()
+    ocv_debug_message(STATUS "OpenCV HAL: ${hal} ...")
+    ocv_clear_vars(OpenCV_HAL_LIBRARIES OpenCV_HAL_HEADERS OpenCV_HAL_INCLUDE_DIRS)  # clear the result variables before each lookup
+    find_package(${hal} NO_MODULE QUIET)  # config-mode lookup only; a missing package is skipped silently
+    if(${hal}_FOUND)
+      ocv_hal_register(OpenCV_HAL_LIBRARIES OpenCV_HAL_HEADERS OpenCV_HAL_INCLUDE_DIRS)  # NOTE(review): assumes the package config sets OpenCV_HAL_* - confirm
+      list(APPEND OpenCV_USED_HAL "${hal} (ver ${${hal}_VERSION})")  # OpenCV_USED_HAL is printed on the "Use custom HAL:" status line
+    endif()
+  endif()
+endforeach()
 configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/custom_hal.hpp.in" "${CMAKE_BINARY_DIR}/custom_hal.hpp" @ONLY)
-unset(_includes)
+unset(_hal_includes)
+

 # ----------------------------------------------------------------------------
 # Add CUDA libraries (needed for apps/tools, samples)
@ -633,7 +674,7 @@ if(HAVE_CUDA)
    set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cufft_LIBRARY})
  endif()
  foreach(p ${CUDA_LIBS_PATH})
-    set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} -L${p})
+    set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CMAKE_LIBRARY_PATH_FLAG}${p})
  endforeach()
 endif()
 # ----------------------------------------------------------------------------
@ -691,7 +732,7 @@ include(cmake/OpenCVGenPkgconfig.cmake)
 # Generate OpenCV.mk for ndk-build (Android build tool)
 include(cmake/OpenCVGenAndroidMK.cmake)

-# Generate OpenCVСonfig.cmake and OpenCVConfig-version.cmake for cmake projects
+# Generate OpenCVConfig.cmake and OpenCVConfig-version.cmake for cmake projects
 include(cmake/OpenCVGenConfig.cmake)

 # Generate Info.plist for the IOS framework
@ -709,7 +750,7 @@ if(INSTALL_TESTS AND OPENCV_TEST_DATA_PATH)
    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/opencv_run_all_tests_android.sh.in"
                   "${CMAKE_BINARY_DIR}/unix-install/opencv_run_all_tests.sh" @ONLY)
    install(PROGRAMS "${CMAKE_BINARY_DIR}/unix-install/opencv_run_all_tests.sh"
-            DESTINATION ${CMAKE_INSTALL_PREFIX} COMPONENT tests)
+            DESTINATION ./ COMPONENT tests)
  elseif(WIN32)
    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/opencv_run_all_tests_windows.cmd.in"
                   "${CMAKE_BINARY_DIR}/win-install/opencv_run_all_tests.cmd" @ONLY)
@ -737,11 +778,11 @@ endif()
 if(ANDROID OR NOT UNIX)
  install(FILES ${OPENCV_LICENSE_FILE}
        PERMISSIONS OWNER_READ GROUP_READ WORLD_READ
-        DESTINATION ${CMAKE_INSTALL_PREFIX} COMPONENT libs)
+        DESTINATION ./ COMPONENT libs)
  if(OPENCV_README_FILE)
    install(FILES ${OPENCV_README_FILE}
            PERMISSIONS OWNER_READ GROUP_READ WORLD_READ
-            DESTINATION ${CMAKE_INSTALL_PREFIX} COMPONENT libs)
+            DESTINATION ./ COMPONENT libs)
  endif()
 endif()

@ -753,10 +794,46 @@ status("General configuration for OpenCV ${OPENCV_VERSION} =====================
 if(OPENCV_VCSVERSION)
  status("  Version control:" ${OPENCV_VCSVERSION})
 endif()
+if(OPENCV_EXTRA_MODULES_PATH AND NOT BUILD_INFO_SKIP_EXTRA_MODULES)  # report location and VCS version of every extra-modules directory; BUILD_INFO_SKIP_EXTRA_MODULES disables the report
+  set(__dump_extra_header OFF)  # becomes ON once the "Extra modules:" header has been printed
+  foreach(p ${OPENCV_EXTRA_MODULES_PATH})
+    if(EXISTS ${p})  # paths that do not exist are left out of the report
+      if(NOT __dump_extra_header)
+        set(__dump_extra_header ON)
+        status("")
+        status("  Extra modules:")
+      else()
+        status("")  # blank separator line between consecutive entries
+      endif()
+      set(EXTRA_MODULES_VCSVERSION "unknown")  # value reported when git is not available
+      if(GIT_FOUND)
+        execute_process(COMMAND "${GIT_EXECUTABLE}" describe --tags --always --dirty --match "[0-9].[0-9].[0-9]*"
+          WORKING_DIRECTORY "${p}"
+          OUTPUT_VARIABLE EXTRA_MODULES_VCSVERSION
+          RESULT_VARIABLE GIT_RESULT
+          ERROR_QUIET
+          OUTPUT_STRIP_TRAILING_WHITESPACE
+        )
+        if(NOT GIT_RESULT EQUAL 0)  # git describe failed in this directory
+          set(EXTRA_MODULES_VCSVERSION "unknown")
+        endif()
+      endif()
+      status("    Location (extra):" ${p})
+      status("    Version control (extra):" ${EXTRA_MODULES_VCSVERSION})
+    endif()
+  endforeach()
+  unset(__dump_extra_header)
+endif()

 # ========================== build platform ==========================
 status("")
 status("  Platform:")
+if(NOT CMAKE_VERSION VERSION_LESS 2.8.11 AND NOT BUILD_INFO_SKIP_TIMESTAMP)  # only on CMake >= 2.8.11; BUILD_INFO_SKIP_TIMESTAMP suppresses the line
+  string(TIMESTAMP TIMESTAMP "" UTC)  # empty format string, UTC time; the result is stored in the variable TIMESTAMP
+  if(TIMESTAMP)
+    status("    Timestamp:"    ${TIMESTAMP})
+  endif()
+endif()
 status("    Host:"             ${CMAKE_HOST_SYSTEM_NAME} ${CMAKE_HOST_SYSTEM_VERSION} ${CMAKE_HOST_SYSTEM_PROCESSOR})
 if(CMAKE_CROSSCOMPILING)
  status("    Target:"         ${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_VERSION} ${CMAKE_SYSTEM_PROCESSOR})
@ -1119,10 +1196,14 @@ if(DEFINED WITH_VA_INTEL)
 status("    Use Intel VA-API/OpenCL:"  HAVE_VA_INTEL       THEN "YES (MSDK: ${VA_INTEL_MSDK_ROOT}  OpenCL: ${VA_INTEL_IOCL_ROOT})" ELSE NO)
 endif(DEFINED WITH_VA_INTEL)

+if(DEFINED WITH_LAPACK)
+status("    Use Lapack:"      HAVE_LAPACK     THEN "YES" ELSE NO)
+endif(DEFINED WITH_LAPACK)
+
 status("    Use Eigen:"      HAVE_EIGEN       THEN "YES (ver ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})" ELSE NO)
 status("    Use Cuda:"       HAVE_CUDA        THEN "YES (ver ${CUDA_VERSION_STRING})" ELSE NO)
 status("    Use OpenCL:"     HAVE_OPENCL      THEN YES ELSE NO)
-status("    Use custom HAL:" OPENCV_HAL_HEADERS AND OPENCV_HAL_LIBS THEN "YES (${OPENCV_HAL_HEADERS}; ${OPENCV_HAL_LIBS})" ELSE "NO")
+status("    Use custom HAL:" OpenCV_USED_HAL  THEN "YES (${OpenCV_USED_HAL})" ELSE "NO")

 if(HAVE_CUDA)
  status("")
@ -1138,14 +1219,13 @@ endif()

 if(HAVE_OPENCL)
  status("")
-  status("  OpenCL:")
  if(HAVE_OPENCL_STATIC)
-    set(__opencl_ver "static")
+    set(__opencl_type "<Link with OpenCL library>")
  else()
-    set(__opencl_ver "dynamic")
+    set(__opencl_type "<Dynamic loading of OpenCL library>")
  endif()
-  status("    Version:"       ${__opencl_ver})
-  if(OPENCL_INCLUDE_DIR)
+  status("  OpenCL:"                 ${__opencl_type})
+  if(OPENCL_INCLUDE_DIRS)
    status("    Include path:"       ${OPENCL_INCLUDE_DIRS})
  endif()
  if(OPENCL_LIBRARIES)
@ -1162,7 +1242,7 @@ if(HAVE_OPENCL)
        list(APPEND __libs "${l}")
      endif()
    endforeach()
-    status("    libraries:"          ${__libs})
+    status("    Link libraries:"          ${__libs})
  endif()
  status("    Use AMDFFT:"           HAVE_CLAMDFFT  THEN YES ELSE NO)
  status("    Use AMDBLAS:"          HAVE_CLAMDBLAS THEN YES ELSE NO)
@ -1255,3 +1335,7 @@ endif()
 # ----------------------------------------------------------------------------

 include(cmake/OpenCVPackaging.cmake)
+
+# This should be the last command
+ocv_cmake_dump_vars("" TOFILE "CMakeVars.txt")
+ocv_cmake_eval(DEBUG_POST ONCE)
--- a/8
+++ b/8
@ -7,12 +7,12 @@ copy or use the software.
               For Open Source Computer Vision Library
                       (3-clause BSD License)

-Copyright (C) 2000-2015, Intel Corporation, all rights reserved.
+Copyright (C) 2000-2016, Intel Corporation, all rights reserved.
 Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
-Copyright (C) 2009-2015, NVIDIA Corporation, all rights reserved.
+Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved.
 Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-Copyright (C) 2015, OpenCV Foundation, all rights reserved.
-Copyright (C) 2015, Itseez Inc., all rights reserved.
+Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved.
+Copyright (C) 2015-2016, Itseez Inc., all rights reserved.
 Third party copyrights are property of their respective owners.

 Redistribution and use in source and binary forms, with or without modification,
--- a/README.md
+++ b/README.md
@ -1,7 +1,5 @@
 ### OpenCV: Open Source Computer Vision Library

-[![Gittip](http://img.shields.io/gittip/OpenCV.png)](https://www.gittip.com/OpenCV/)
-
 #### Resources

 * Homepage: <http://opencv.org>
--- a/apps/CMakeLists.txt
+++ b/apps/CMakeLists.txt
@ -4,3 +4,4 @@ link_libraries(${OPENCV_LINKER_LIBS})
 add_subdirectory(traincascade)
 add_subdirectory(createsamples)
 add_subdirectory(annotation)
+add_subdirectory(visualisation)
--- a/apps/annotation/CMakeLists.txt
+++ b/apps/annotation/CMakeLists.txt
@ -21,7 +21,6 @@ set_target_properties(${the_target} PROPERTIES
                      DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
                      ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
                      RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
-                      INSTALL_NAME_DIR lib
                      OUTPUT_NAME "opencv_annotation")

 if(ENABLE_SOLUTION_FOLDERS)
--- a/apps/annotation/opencv_annotation.cpp
+++ b/apps/annotation/opencv_annotation.cpp
@ -46,6 +46,9 @@ USAGE:
 ./opencv_annotation -images <folder location> -annotations <ouput file>

 Created by: Puttemans Steven - February 2015
+Adapted by: Puttemans Steven - April 2016 - Vectorize the process to enable better processing
+                                               + early leave and store by pressing an ESC key
+                                               + enable delete `d` button, to remove last annotation
 *****************************************************************************************************/

 #include <opencv2/core.hpp>
@ -68,16 +71,15 @@ using namespace cv;

 // Function prototypes
 void on_mouse(int, int, int, int, void*);
-string int2string(int);
-void get_annotations(Mat, stringstream*);
+vector<Rect> get_annotations(Mat);

 // Public parameters
 Mat image;
 int roi_x0 = 0, roi_y0 = 0, roi_x1 = 0, roi_y1 = 0, num_of_rec = 0;
-bool start_draw = false;
+bool start_draw = false, stop = false;

 // Window name for visualisation purposes
-const string window_name="OpenCV Based Annotation Tool";
+const string window_name = "OpenCV Based Annotation Tool";

 // FUNCTION : Mouse response for selecting objects in images
 // If left button is clicked, start drawing a rectangle as long as mouse moves
@ -98,7 +100,8 @@ void on_mouse(int event, int x, int y, int , void * )
            start_draw = false;
        }
    }
-    // Action when mouse is moving
+
+    // Action when mouse is moving and drawing is enabled
    if((event == EVENT_MOUSEMOVE) && start_draw)
    {
        // Redraw bounding box for annotation
@ -109,42 +112,34 @@ void on_mouse(int event, int x, int y, int , void * )
    }
 }

-// FUNCTION : snippet to convert an integer value to a string using a clean function
-// instead of creating a stringstream each time inside the main code
-string int2string(int num)
+// FUNCTION : returns a vector of Rect objects given an image containing positive object instances
+vector<Rect> get_annotations(Mat input_image)
 {
-    stringstream temp_stream;
-    temp_stream << num;
-    return temp_stream.str();
-}
+    vector<Rect> current_annotations;

-// FUNCTION : given an image containing positive object instances, add all the object
-// annotations to a known stringstream
-void get_annotations(Mat input_image, stringstream* output_stream)
-{
-    // Make it possible to exit the annotation
-    bool stop = false;
-
-    // Reset the num_of_rec element at each iteration
-    // Make sure the global image is set to the current image
-    num_of_rec = 0;
-    image = input_image;
+    // Make it possible to exit the annotation process
+    stop = false;

    // Init window interface and couple mouse actions
    namedWindow(window_name, WINDOW_AUTOSIZE);
    setMouseCallback(window_name, on_mouse);

+    image = input_image;
    imshow(window_name, image);
-    stringstream temp_stream;
    int key_pressed = 0;

    do
    {
+        // Get a temporary image clone
+        Mat temp_image = input_image.clone();
+        Rect currentRect(0, 0, 0, 0);
+
        // Keys for processing
        // You need to select one for confirming a selection and one to continue to the next image
        // Based on the universal ASCII code of the keystroke: http://www.asciitable.com/
        //      c = 99		    add rectangle to current image
        //	    n = 110		    save added rectangles and show next image
+        //      d = 100         delete the last annotation made
        //	    <ESC> = 27      exit program
        key_pressed = 0xFF & waitKey(0);
        switch( key_pressed )
@ -152,32 +147,53 @@ void get_annotations(Mat input_image, stringstream* output_stream)
        case 27:
                destroyWindow(window_name);
                stop = true;
+                break;
        case 99:
-                // Add a rectangle to the list
-                num_of_rec++;
                // Draw initiated from top left corner
                if(roi_x0<roi_x1 && roi_y0<roi_y1)
                {
-                    temp_stream << " " << int2string(roi_x0) << " " << int2string(roi_y0) << " " << int2string(roi_x1-roi_x0) << " " << int2string(roi_y1-roi_y0);
+                    currentRect.x = roi_x0;
+                    currentRect.y = roi_y0;
+                    currentRect.width = roi_x1-roi_x0;
+                    currentRect.height = roi_y1-roi_y0;
                }
                // Draw initiated from bottom right corner
                if(roi_x0>roi_x1 && roi_y0>roi_y1)
                {
-                    temp_stream << " " << int2string(roi_x1) << " " << int2string(roi_y1) << " " << int2string(roi_x0-roi_x1) << " " << int2string(roi_y0-roi_y1);
+                    currentRect.x = roi_x1;
+                    currentRect.y = roi_y1;
+                    currentRect.width = roi_x0-roi_x1;
+                    currentRect.height = roi_y0-roi_y1;
                }
                // Draw initiated from top right corner
                if(roi_x0>roi_x1 && roi_y0<roi_y1)
                {
-                    temp_stream << " " << int2string(roi_x1) << " " << int2string(roi_y0) << " " << int2string(roi_x0-roi_x1) << " " << int2string(roi_y1-roi_y0);
+                    currentRect.x = roi_x1;
+                    currentRect.y = roi_y0;
+                    currentRect.width = roi_x0-roi_x1;
+                    currentRect.height = roi_y1-roi_y0;
                }
                // Draw initiated from bottom left corner
                if(roi_x0<roi_x1 && roi_y0>roi_y1)
                {
-                    temp_stream << " " << int2string(roi_x0) << " " << int2string(roi_y1) << " " << int2string(roi_x1-roi_x0) << " " << int2string(roi_y0-roi_y1);
+                    currentRect.x = roi_x0;
+                    currentRect.y = roi_y1;
+                    currentRect.width = roi_x1-roi_x0;
+                    currentRect.height = roi_y0-roi_y1;
                }
-
-                rectangle(input_image, Point(roi_x0,roi_y0), Point(roi_x1,roi_y1), Scalar(0,255,0), 1);
-
+                // Draw the rectangle on the canvas
+                // Add the rectangle to the vector of annotations
+                current_annotations.push_back(currentRect);
+                break;
+        case 100:
+                // Remove the last annotation
+                if(current_annotations.size() > 0){
+                    current_annotations.pop_back();
+                }
+                break;
+        default:
+                // Default case --> do nothing at all
+                // Other keystrokes can simply be ignored
                break;
        }

@ -186,19 +202,24 @@ void get_annotations(Mat input_image, stringstream* output_stream)
        {
            break;
        }
+
+        // Draw all the current rectangles onto the top image and make sure that the global image is linked
+        for(int i=0; i < (int)current_annotations.size(); i++){
+            rectangle(temp_image, current_annotations[i], Scalar(0,255,0), 1);
+        }
+        image = temp_image;
+
+        // Force an explicit redraw of the canvas --> necessary to visualize delete correctly
+        imshow(window_name, image);
    }
    // Continue as long as the next image key has not been pressed
    while(key_pressed != 110);

-    // If there are annotations AND the next image key is pressed
-    // Write the image annotations to the file
-    if(num_of_rec>0 && key_pressed==110)
-    {
-        *output_stream << " " << num_of_rec << temp_stream.str() << endl;
-    }
-
    // Close down the window
    destroyWindow(window_name);
+
+    // Return the data
+    return current_annotations;
 }

 int main( int argc, const char** argv )
@ -208,13 +229,13 @@ int main( int argc, const char** argv )
        cout << "Usage: " << argv[0] << endl;
        cout << " -images <folder_location> [example - /data/testimages/]" << endl;
        cout << " -annotations <ouput_file> [example - /data/annotations.txt]" << endl;
-
+        cout << "TIP: Use absolute paths to avoid any problems with the software!" << endl;
        return -1;
    }

    // Read in the input arguments
    string image_folder;
-    string annotations;
+    string annotations_file;
    for(int i = 1; i < argc; ++i )
    {
        if( !strcmp( argv[i], "-images" ) )
@ -223,7 +244,7 @@ int main( int argc, const char** argv )
        }
        else if( !strcmp( argv[i], "-annotations" ) )
        {
-            annotations = argv[++i];
+            annotations_file = argv[++i];
        }
    }

@ -248,14 +269,9 @@ int main( int argc, const char** argv )
    }
    #endif

-    // Create the outputfilestream
-    ofstream output(annotations.c_str());
-    if ( !output.is_open() ){
-        cerr << "The path for the output file contains an error and could not be opened. Please check again!" << endl;
-        return 0;
-    }
-
+    // Start by processing the data
    // Return the image filenames inside the image folder
+    vector< vector<Rect> > annotations;
    vector<String> filenames;
    String folder(image_folder);
    glob(folder, filenames);
@ -273,15 +289,33 @@ int main( int argc, const char** argv )
            continue;
        }

-        // Perform annotations & generate corresponding output
-        stringstream output_stream;
-        get_annotations(current_image, &output_stream);
+        // Perform annotations & store the result inside the vectorized structure
+        vector<Rect> current_annotations = get_annotations(current_image);
+        annotations.push_back(current_annotations);

-        // Store the annotations, write to the output file
-        if (output_stream.str() != ""){
-            output << filenames[i] << output_stream.str();
+        // Check if the ESC key was hit, then exit earlier than expected
+        if(stop){
+            break;
        }
    }

+    // When all data is processed, store the data gathered inside the proper file
+    // This now even gets called when the ESC button was hit to store preliminary results
+    ofstream output(annotations_file.c_str());
+    if ( !output.is_open() ){
+        cerr << "The path for the output file contains an error and could not be opened. Please check again!" << endl;
+        return 0;
+    }
+
+    // Store the annotations, write to the output file
+    for(int i = 0; i < (int)annotations.size(); i++){
+        output << filenames[i] << " " << annotations[i].size();
+        for(int j=0; j < (int)annotations[i].size(); j++){
+            Rect temp = annotations[i][j];
+            output << " " << temp.x << " " << temp.y << " " << temp.width << " " << temp.height;
+        }
+        output << endl;
+    }
+
    return 0;
 }
--- a/apps/createsamples/CMakeLists.txt
+++ b/apps/createsamples/CMakeLists.txt
@ -23,7 +23,6 @@ set_target_properties(${the_target} PROPERTIES
                      DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
                      ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
                      RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
-                      INSTALL_NAME_DIR lib
                      OUTPUT_NAME "opencv_createsamples")

 if(ENABLE_SOLUTION_FOLDERS)
--- a/apps/interactive-calibration/CMakeLists.txt
+++ b/apps/interactive-calibration/CMakeLists.txt
@ -0,0 +1,48 @@
+# Build script for the interactive camera calibration application.
+# The application is skipped when any of the OpenCV modules it needs is unavailable.
+set(OPENCV_INTERACTIVECALIBRATION_DEPS opencv_core opencv_aruco opencv_highgui opencv_calib3d opencv_videoio)
+ocv_check_dependencies(${OPENCV_INTERACTIVECALIBRATION_DEPS})
+
+if(NOT OCV_DEPENDENCIES_FOUND)
+  return()
+endif()
+
+# Optional LAPACK support: USE_LAPACK is defined only when both the library
+# and the lapacke.h header are found.
+find_package(LAPACK)
+if(LAPACK_FOUND)
+  find_file(LAPACK_HEADER "lapacke.h")
+  if(LAPACK_HEADER)
+    add_definitions(-DUSE_LAPACK)
+    link_libraries(${LAPACK_LIBRARIES})
+  endif()
+endif()
+
+project(interactive-calibration)
+set(the_target opencv_interactive-calibration)
+
+ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
+ocv_target_include_modules_recurse(${the_target} ${OPENCV_INTERACTIVECALIBRATION_DEPS})
+
+# Every source and header in this directory belongs to the application.
+file(GLOB SRCS *.cpp)
+file(GLOB HDRS *.h*)
+
+set(interactive-calibration_files ${SRCS} ${HDRS})
+
+ocv_add_executable(${the_target} ${interactive-calibration_files})
+ocv_target_link_libraries(${the_target} ${OPENCV_INTERACTIVECALIBRATION_DEPS})
+
+# INSTALL_NAME_DIR is intentionally not set here, matching the annotation and
+# createsamples applications.
+set_target_properties(${the_target} PROPERTIES
+                      DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
+                      ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
+                      RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
+                      OUTPUT_NAME "opencv_interactive-calibration")
+
+if(ENABLE_SOLUTION_FOLDERS)
+  set_target_properties(${the_target} PROPERTIES FOLDER "applications")
+endif()
+
+# Distribution builds install the tool only for shared-library builds and only
+# for the Release configuration; otherwise the install is optional.
+if(INSTALL_CREATE_DISTRIB)
+  if(BUILD_SHARED_LIBS)
+    install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev)
+  endif()
+else()
+  install(TARGETS ${the_target} OPTIONAL RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
+endif()
--- a/apps/interactive-calibration/calibCommon.hpp
+++ b/apps/interactive-calibration/calibCommon.hpp
@ -0,0 +1,118 @@
+#ifndef CALIB_COMMON_HPP
+#define CALIB_COMMON_HPP
+
+#include <memory>
+#include <opencv2/core.hpp>
+#include <vector>
+#include <string>
+
+namespace calib
+{
+    // Delay used for overlay messages (presumably milliseconds -- TODO confirm against showOverlayMessage).
+    #define OVERLAY_DELAY 1000
+    // Default frame size used to initialise calibrationData::imageSize and
+    // captureParameters::cameraResolution.
+    #define IMAGE_MAX_WIDTH 1280
+    #define IMAGE_MAX_HEIGHT 960
+
+    // Shows a short message to the user; defined outside this header.
+    bool showOverlayMessage(const std::string& message);
+
+    enum InputType { Video, Pictures };
+    enum InputVideoSource { Camera, File };
+    enum TemplateType { AcirclesGrid, Chessboard, chAruco, DoubleAcirclesGrid };
+
+    static const std::string mainWindowName = "Calibration";
+    static const std::string gridWindowName = "Board locations";
+    // Hot-key summary. Every entry except the last ends with '\n' so that each
+    // key is listed on its own line.
+    static const std::string consoleHelp = "Hot keys:\nesc - exit application\n"
+                              "s - save current data to .xml file\n"
+                              "r - delete last frame\n"
+                              "u - enable/disable applying undistortion\n"
+                              "d - delete all frames\n"
+                              "v - switch visualization";
+
+    // Multiplier applied to standard deviations when reporting "+-" intervals
+    // (presumably the 95% normal-distribution quantile -- TODO confirm).
+    static const double sigmaMult = 1.96;
+
+    // Shared calibration state: the current estimate, its uncertainty and the
+    // collected detections it was computed from.
+    struct calibrationData
+    {
+        cv::Mat cameraMatrix;
+        cv::Mat distCoeffs;
+        cv::Mat stdDeviations;
+        cv::Mat perViewErrors;
+        std::vector<cv::Mat> rvecs;
+        std::vector<cv::Mat> tvecs;
+        double totalAvgErr;
+        cv::Size imageSize;
+
+        std::vector<std::vector<cv::Point2f> > imagePoints;
+        std::vector< std::vector<cv::Point3f> > objectPoints;
+
+        std::vector<cv::Mat> allCharucoCorners;
+        std::vector<cv::Mat> allCharucoIds;
+
+        cv::Mat undistMap1, undistMap2;
+
+        // Starts with no calibration result; the image size defaults to the
+        // maximum supported frame size.
+        calibrationData()
+        {
+            // Give the error a defined value so that reading it before the
+            // first calibration run is not undefined behaviour.
+            totalAvgErr = 0.;
+            imageSize = cv::Size(IMAGE_MAX_WIDTH, IMAGE_MAX_HEIGHT);
+        }
+    };
+
+    // Snapshot of one calibration result (intrinsics, distortion, their
+    // standard deviations and the average error).
+    struct cameraParameters
+    {
+        cv::Mat cameraMatrix;
+        cv::Mat distCoeffs;
+        cv::Mat stdDeviations;
+        double avgError;
+
+        // Empty snapshot; the error is zeroed so it never holds an indeterminate value.
+        cameraParameters() : avgError(0) {}
+        // The matrices are stored as cv::Mat headers sharing data with the
+        // arguments; pass copies if the snapshot must be independent.
+        cameraParameters(const cv::Mat& _cameraMatrix, const cv::Mat& _distCoeffs, const cv::Mat& _stdDeviations, double _avgError = 0) :
+            cameraMatrix(_cameraMatrix), distCoeffs(_distCoeffs), stdDeviations(_stdDeviations), avgError(_avgError)
+        {}
+    };
+
+    // Settings describing where frames come from and which calibration
+    // pattern is expected.
+    // NOTE(review): only the fields assigned in the constructor below get
+    // defaults; all other non-class fields start indeterminate -- confirm that
+    // every caller fills them in before use.
+    struct captureParameters
+    {
+        InputType captureMethod;
+        InputVideoSource source;
+        TemplateType board;
+        cv::Size boardSize;
+        int charucoDictName;
+        int calibrationStep;
+        float charucoSquareLenght, charucoMarkerSize;
+        float captureDelay;
+        float squareSize;
+        float templDst;
+        std::string videoFileName;
+        bool flipVertical;
+        int camID;
+        int fps;
+        cv::Size cameraResolution;
+        int maxFramesNum;
+        int minFramesNum;
+
+        // Sets defaults for the step, delay, frame limits, fps and resolution only.
+        captureParameters()
+        {
+            calibrationStep = 1;
+            captureDelay = 500.f;
+            maxFramesNum = 30;
+            minFramesNum = 10;
+            fps = 30;
+            cameraResolution = cv::Size(IMAGE_MAX_WIDTH, IMAGE_MAX_HEIGHT);
+        }
+    };
+    };
+
+    // Tunable solver and frame-filter settings together with their defaults.
+    struct internalParameters
+    {
+        double solverEps;
+        int solverMaxIters;
+        bool fastSolving;
+        double filterAlpha;
+
+        // All four settings receive their default value on construction.
+        internalParameters() :
+            solverEps(1e-7),
+            solverMaxIters(30),
+            fastSolving(false),
+            filterAlpha(0.1)
+        {}
+    };
+}
+
+#endif
--- a/apps/interactive-calibration/calibController.cpp
+++ b/apps/interactive-calibration/calibController.cpp
@ -0,0 +1,327 @@
+#include "calibController.hpp"
+
+#include <algorithm>
+#include <cmath>
+#include <ctime>
+#include <opencv2/calib3d.hpp>
+#include <opencv2/imgproc.hpp>
+
+// Estimates how evenly the collected points cover the image.
+// The image is split into a gridSize x gridSize grid, the points falling into
+// each cell are counted, and mean(count) / stddev(count) is returned, so a
+// more uniform coverage gives a larger value.
+double calib::calibController::estimateCoverageQuality()
+{
+    const int gridSize = 10;
+    // A step of at least one pixel keeps the divisions below well defined even
+    // for images smaller than the grid.
+    const int xGridStep = std::max(mCalibData->imageSize.width / gridSize, 1);
+    const int yGridStep = std::max(mCalibData->imageSize.height / gridSize, 1);
+    std::vector<int> pointsInCell(gridSize*gridSize, 0);
+
+    // Cell indices are clamped to the grid: points on the far image border (or
+    // slightly outside it, or in an image whose size is not a multiple of
+    // gridSize) would otherwise index past the end of pointsInCell.
+    for(std::vector<std::vector<cv::Point2f> >::iterator it = mCalibData->imagePoints.begin(); it != mCalibData->imagePoints.end(); ++it)
+        for(std::vector<cv::Point2f>::iterator pointIt = (*it).begin(); pointIt != (*it).end(); ++pointIt) {
+            int i = std::min(std::max((int)((*pointIt).x / xGridStep), 0), gridSize - 1);
+            int j = std::min(std::max((int)((*pointIt).y / yGridStep), 0), gridSize - 1);
+            pointsInCell[i*gridSize + j]++;
+        }
+
+    for(std::vector<cv::Mat>::iterator it = mCalibData->allCharucoCorners.begin(); it != mCalibData->allCharucoCorners.end(); ++it)
+        for(int l = 0; l < (*it).size[0]; l++) {
+            int i = std::min(std::max((int)((*it).at<float>(l, 0) / xGridStep), 0), gridSize - 1);
+            int j = std::min(std::max((int)((*it).at<float>(l, 1) / yGridStep), 0), gridSize - 1);
+            pointsInCell[i*gridSize + j]++;
+        }
+
+    cv::Mat mean, stdDev;
+    cv::meanStdDev(pointsInCell, mean, stdDev);
+
+    // The small constant avoids a division by zero when all cells hold the same count.
+    return mean.at<double>(0) / (stdDev.at<double>(0) + 1e-7);
+}
+
+// Default constructor: no calibration data attached. Every member gets a
+// defined value; the minimal frame count mirrors the captureParameters default.
+calib::calibController::calibController() :
+    mCalibFlags(0),
+    mMinFramesNum(10),
+    mNeedTuning(false),
+    mConfIntervalsState(false),
+    mCoverageQualityState(false)
+{
+}
+
+// Creates a controller observing `data`.
+//   initialFlags - calibration flags to start from
+//   autoTuning   - when true, updateState() may add further calibration flags
+//   minFramesNum - frame count that must be exceeded before the state checks apply
+calib::calibController::calibController(cv::Ptr<calib::calibrationData> data, int initialFlags, bool autoTuning, int minFramesNum) :
+    mCalibData(data)
+{
+    mCalibFlags = initialFlags;
+    mNeedTuning = autoTuning;
+    mMinFramesNum = minFramesNum;
+    mConfIntervalsState = false;
+    mCoverageQualityState = false;
+}
+
+// Re-evaluates the quality indicators (confidence intervals, coverage) and,
+// when auto tuning is enabled, adds calibration flags that fix parameters
+// which look negligible or redundant.
+void calib::calibController::updateState()
+{
+    // Element counts required by the indexing below:
+    //   distCoeffs    -> [K1, K2, TD1, TD2, K3]
+    //   stdDeviations -> [Fx, Fy, Cx, Cy, K1, K2, TD1, TD2, K3]
+    // Matrices that are still empty (no calibration run yet) are skipped
+    // instead of being read out of bounds.
+    const size_t distCoeffsNum = 5;
+    const size_t stdDevsNum = 4 + distCoeffsNum;
+    const bool hasCameraMatrix = mCalibData->cameraMatrix.total() > 0;
+    const bool hasDistCoeffs = mCalibData->distCoeffs.total() >= distCoeffsNum;
+
+    if(hasCameraMatrix && hasDistCoeffs && mCalibData->stdDeviations.total() >= stdDevsNum) {
+        const double relErrEps = 0.05;
+        bool fConfState = false, cConfState = false, dConfState = true;
+        // Focal lengths: the scaled deviation must stay below relErrEps relative to the value.
+        if(sigmaMult*mCalibData->stdDeviations.at<double>(0) / mCalibData->cameraMatrix.at<double>(0,0) < relErrEps &&
+                sigmaMult*mCalibData->stdDeviations.at<double>(1) / mCalibData->cameraMatrix.at<double>(1,1) < relErrEps)
+            fConfState = true;
+        // Principal point: same relative criterion.
+        if(sigmaMult*mCalibData->stdDeviations.at<double>(2) / mCalibData->cameraMatrix.at<double>(0,2) < relErrEps &&
+                sigmaMult*mCalibData->stdDeviations.at<double>(3) / mCalibData->cameraMatrix.at<double>(1,2) < relErrEps)
+            cConfState = true;
+
+        // Distortion: each deviation must not exceed the magnitude of its coefficient.
+        for(int i = 0; i < (int)distCoeffsNum; i++)
+            if(mCalibData->stdDeviations.at<double>(4+i) / fabs(mCalibData->distCoeffs.at<double>(i)) > 1)
+                dConfState = false;
+
+        mConfIntervalsState = fConfState && cConfState && dConfState;
+    }
+
+    if(getFramesNumberState())
+        mCoverageQualityState = estimateCoverageQuality() > 1.8;
+
+    if (getFramesNumberState() && mNeedTuning) {
+        if( !(mCalibFlags & cv::CALIB_FIX_ASPECT_RATIO) &&
+            hasCameraMatrix && mCalibData->stdDeviations.total() >= 2) {
+            double fDiff = fabs(mCalibData->cameraMatrix.at<double>(0,0) -
+                                mCalibData->cameraMatrix.at<double>(1,1));
+
+            // Fx and Fy are indistinguishable within three standard deviations:
+            // fix the aspect ratio and make both focal lengths equal.
+            if (fDiff < 3*mCalibData->stdDeviations.at<double>(0) &&
+                    fDiff < 3*mCalibData->stdDeviations.at<double>(1)) {
+                mCalibFlags |= cv::CALIB_FIX_ASPECT_RATIO;
+                mCalibData->cameraMatrix.at<double>(0,0) =
+                        mCalibData->cameraMatrix.at<double>(1,1);
+            }
+        }
+
+        if(hasDistCoeffs) {
+            // Coefficients smaller than eps are treated as negligible and fixed.
+            const double eps = 0.005;
+
+            if(!(mCalibFlags & cv::CALIB_ZERO_TANGENT_DIST) &&
+                    fabs(mCalibData->distCoeffs.at<double>(2)) < eps &&
+                    fabs(mCalibData->distCoeffs.at<double>(3)) < eps)
+                mCalibFlags |= cv::CALIB_ZERO_TANGENT_DIST;
+
+            if(!(mCalibFlags & cv::CALIB_FIX_K1) &&
+                    fabs(mCalibData->distCoeffs.at<double>(0)) < eps)
+                mCalibFlags |= cv::CALIB_FIX_K1;
+
+            if(!(mCalibFlags & cv::CALIB_FIX_K2) &&
+                    fabs(mCalibData->distCoeffs.at<double>(1)) < eps)
+                mCalibFlags |= cv::CALIB_FIX_K2;
+
+            if(!(mCalibFlags & cv::CALIB_FIX_K3) &&
+                    fabs(mCalibData->distCoeffs.at<double>(4)) < eps)
+                mCalibFlags |= cv::CALIB_FIX_K3;
+        }
+    }
+}
+
+// True only when all four indicators (frame count, confidence intervals, RMS
+// and coverage quality) are satisfied at the same time.
+bool calib::calibController::getCommonCalibrationState() const
+{
+    return getFramesNumberState() &&
+           getConfidenceIntrervalsState() &&
+           getRMSState() &&
+           mCoverageQualityState;
+}
+
+// True once more frames than the configured minimum were collected, counting
+// whichever of the two detection containers is in use.
+bool calib::calibController::getFramesNumberState() const
+{
+    const size_t collectedFrames = std::max(mCalibData->imagePoints.size(),
+                                            mCalibData->allCharucoCorners.size());
+    return collectedFrames > mMinFramesNum;
+}
+
+// Result of the confidence-interval check performed by the last updateState().
+bool calib::calibController::getConfidenceIntrervalsState() const
+{
+    return mConfIntervalsState;
+}
+
+// True while the average error stored in the calibration data is below 0.5.
+bool calib::calibController::getRMSState() const
+{
+    return mCalibData->totalAvgErr < 0.5;
+}
+
+// Calibration flags, including any added by updateState().
+int calib::calibController::getNewFlags() const
+{
+    return mCalibFlags;
+}
+
+
+//////////////////// calibDataController
+
+// Coverage quality (mean / stddev of per-cell point counts on a 10x10 grid)
+// computed over all frames except the one at `excludedIndex`. Passing an
+// index that matches no frame evaluates the complete set.
+double calib::calibDataController::estimateGridSubsetQuality(size_t excludedIndex)
+{
+    const int gridSize = 10;
+    // A step of at least one pixel keeps the divisions below well defined even
+    // for images smaller than the grid.
+    const int xGridStep = std::max(mCalibData->imageSize.width / gridSize, 1);
+    const int yGridStep = std::max(mCalibData->imageSize.height / gridSize, 1);
+    std::vector<int> pointsInCell(gridSize*gridSize, 0);
+
+    // Cell indices are clamped to the grid so that border or out-of-image
+    // points cannot index past the end of pointsInCell.
+    for(size_t k = 0; k < mCalibData->imagePoints.size(); k++)
+        if(k != excludedIndex)
+            for(std::vector<cv::Point2f>::iterator pointIt = mCalibData->imagePoints[k].begin(); pointIt != mCalibData->imagePoints[k].end(); ++pointIt) {
+                int i = std::min(std::max((int)((*pointIt).x / xGridStep), 0), gridSize - 1);
+                int j = std::min(std::max((int)((*pointIt).y / yGridStep), 0), gridSize - 1);
+                pointsInCell[i*gridSize + j]++;
+            }
+
+    for(size_t k = 0; k < mCalibData->allCharucoCorners.size(); k++)
+        if(k != excludedIndex)
+            for(int l = 0; l <  mCalibData->allCharucoCorners[k].size[0]; l++) {
+                int i = std::min(std::max((int)(mCalibData->allCharucoCorners[k].at<float>(l, 0) / xGridStep), 0), gridSize - 1);
+                int j = std::min(std::max((int)(mCalibData->allCharucoCorners[k].at<float>(l, 1) / yGridStep), 0), gridSize - 1);
+                pointsInCell[i*gridSize + j]++;
+            }
+
+    cv::Mat mean, stdDev;
+    cv::meanStdDev(pointsInCell, mean, stdDev);
+
+    // The small constant avoids a division by zero when all cells hold the same count.
+    return mean.at<double>(0) / (stdDev.at<double>(0) + 1e-7);
+}
+
+// Creates a data controller operating on `data`.
+//   maxFrames     - frame count at which filterFrames() starts dropping frames
+//   convParameter - weight of the per-view error versus the coverage change
+//                   when filterFrames() ranks frames
+calib::calibDataController::calibDataController(cv::Ptr<calib::calibrationData> data, int maxFrames, double convParameter) :
+    mCalibData(data), mParamsFileName("CamParams.xml")
+{
+    mMaxFramesNum = maxFrames;
+    mAlpha = convParameter;
+}
+
+// Default constructor: no calibration data attached. The numeric members get
+// defined values mirroring the captureParameters / internalParameters
+// defaults instead of being left indeterminate.
+calib::calibDataController::calibDataController() :
+    mParamsFileName("CamParams.xml"), mMaxFramesNum(30), mAlpha(0.1)
+{
+}
+
+// Once the number of stored frames reaches the configured maximum, removes
+// the single "worst" frame. A frame scores badly when its per-view error is
+// high and when removing it changes the coverage quality favourably; mAlpha
+// balances the two terms.
+void calib::calibDataController::filterFrames()
+{
+    size_t numberOfFrames = std::max(mCalibData->allCharucoIds.size(), mCalibData->imagePoints.size());
+    CV_Assert(numberOfFrames == mCalibData->perViewErrors.total());
+    // Nothing to drop: either no frames at all or the limit is not reached yet.
+    if(numberOfFrames == 0 || numberOfFrames < mMaxFramesNum)
+        return;
+
+    // An index equal to the frame count excludes nothing, i.e. this is the
+    // quality of the complete set.
+    double worstValue = -HUGE_VAL, maxQuality = estimateGridSubsetQuality(numberOfFrames);
+    size_t worstElemIndex = 0;
+    for(size_t i = 0; i < numberOfFrames; i++) {
+        double gridQDelta = estimateGridSubsetQuality(i) - maxQuality;
+        double currentValue = mCalibData->perViewErrors.at<double>((int)i)*mAlpha + gridQDelta*(1. - mAlpha);
+        if(currentValue > worstValue) {
+            worstValue = currentValue;
+            worstElemIndex = i;
+        }
+    }
+    // "%d" expects an int; passing a size_t directly is a format mismatch.
+    showOverlayMessage(cv::format("Frame %d is worst", (int)(worstElemIndex + 1)));
+
+    if(mCalibData->imagePoints.size()) {
+        mCalibData->imagePoints.erase(mCalibData->imagePoints.begin() + worstElemIndex);
+        mCalibData->objectPoints.erase(mCalibData->objectPoints.begin() + worstElemIndex);
+    }
+    else {
+        mCalibData->allCharucoCorners.erase(mCalibData->allCharucoCorners.begin() + worstElemIndex);
+        mCalibData->allCharucoIds.erase(mCalibData->allCharucoIds.begin() + worstElemIndex);
+    }
+
+    // Rebuild the per-view error vector without the removed frame's entry.
+    cv::Mat newErrorsVec = cv::Mat((int)numberOfFrames - 1, 1, CV_64F);
+    std::copy(mCalibData->perViewErrors.ptr<double>(0),
+              mCalibData->perViewErrors.ptr<double>((int)worstElemIndex), newErrorsVec.ptr<double>(0));
+    std::copy(mCalibData->perViewErrors.ptr<double>((int)worstElemIndex + 1), mCalibData->perViewErrors.ptr<double>((int)numberOfFrames),
+                newErrorsVec.ptr<double>((int)worstElemIndex));
+    mCalibData->perViewErrors = newErrorsVec;
+}
+
+// Sets the file that saveCurrentCameraParameters() writes to.
+void calib::calibDataController::setParametersFileName(const std::string &name)
+{
+    mParamsFileName = name;
+}
+
+// Drops the most recently added frame from whichever detection containers are
+// populated and restores the parameter snapshot saved on top of the stack.
+void calib::calibDataController::deleteLastFrame()
+{
+    const bool hasImagePoints = !mCalibData->imagePoints.empty();
+    if(hasImagePoints) {
+        mCalibData->imagePoints.pop_back();
+        mCalibData->objectPoints.pop_back();
+    }
+
+    const bool hasCharucoCorners = !mCalibData->allCharucoCorners.empty();
+    if(hasCharucoCorners) {
+        mCalibData->allCharucoCorners.pop_back();
+        mCalibData->allCharucoIds.pop_back();
+    }
+
+    if(mParamsStack.empty())
+        return;
+
+    // Copy everything out of the snapshot before it is popped.
+    const cameraParameters& previous = mParamsStack.top();
+    mCalibData->cameraMatrix = previous.cameraMatrix;
+    mCalibData->distCoeffs = previous.distCoeffs;
+    mCalibData->stdDeviations = previous.stdDeviations;
+    mCalibData->totalAvgErr = previous.avgError;
+    mParamsStack.pop();
+}
+
+// Pushes a deep copy of the current calibration result onto the parameter
+// stack so that deleteLastFrame() can restore it later.
+void calib::calibDataController::rememberCurrentParameters()
+{
+    cv::Mat cameraMatSnapshot = mCalibData->cameraMatrix.clone();
+    cv::Mat distCoeffsSnapshot = mCalibData->distCoeffs.clone();
+    cv::Mat stdDevsSnapshot = mCalibData->stdDeviations.clone();
+
+    cameraParameters snapshot(cameraMatSnapshot, distCoeffsSnapshot, stdDevsSnapshot, mCalibData->totalAvgErr);
+    mParamsStack.push(snapshot);
+}
+
+// Discards every collected frame, resets the camera matrix and distortion
+// coefficients to empty matrices, and restarts the parameter stack with a
+// single snapshot of that state.
+void calib::calibDataController::deleteAllData()
+{
+    // Collected detections.
+    mCalibData->imagePoints.clear();
+    mCalibData->objectPoints.clear();
+    mCalibData->allCharucoCorners.clear();
+    mCalibData->allCharucoIds.clear();
+
+    // Calibration result.
+    mCalibData->distCoeffs = cv::Mat();
+    mCalibData->cameraMatrix = mCalibData->distCoeffs;
+
+    // Undo history.
+    mParamsStack = std::stack<cameraParameters>();
+    rememberCurrentParameters();
+}
+
+// Writes the current calibration result to mParamsFileName.
+// Returns false when there is no camera matrix yet or the file cannot be
+// opened for writing, true after the parameters were written.
+bool calib::calibDataController::saveCurrentCameraParameters() const
+{
+    if(!mCalibData->cameraMatrix.total())
+        return false;
+
+    cv::FileStorage parametersWriter(mParamsFileName, cv::FileStorage::WRITE);
+    if(!parametersWriter.isOpened())
+        return false;
+
+    // Timestamp in the locale's date/time representation.
+    time_t rawtime;
+    time(&rawtime);
+    char buf[256];
+    strftime(buf, sizeof(buf)-1, "%c", localtime(&rawtime));
+
+    const int framesCount = std::max((int)mCalibData->objectPoints.size(), (int)mCalibData->allCharucoCorners.size());
+
+    parametersWriter << "calibrationDate" << buf;
+    parametersWriter << "framesCount" << framesCount;
+    parametersWriter << "cameraResolution" << mCalibData->imageSize;
+    parametersWriter << "cameraMatrix" << mCalibData->cameraMatrix;
+    // Rows 0..3 of stdDeviations go with the camera matrix, rows 4..8 with the distortion coefficients.
+    parametersWriter << "cameraMatrix_std_dev" << mCalibData->stdDeviations.rowRange(cv::Range(0, 4));
+    parametersWriter << "dist_coeffs" << mCalibData->distCoeffs;
+    parametersWriter << "dist_coeffs_std_dev" << mCalibData->stdDeviations.rowRange(cv::Range(4, 9));
+    parametersWriter << "avg_reprojection_error" << mCalibData->totalAvgErr;
+
+    parametersWriter.release();
+    return true;
+}
+
+// Prints a human-readable summary of the current calibration to `output`:
+// frame count, RMS, focal length(s), principal point and the five distortion
+// coefficients, each followed by "+-" sigmaMult times its standard deviation.
+// NOTE(review): the matrices are read without an emptiness check -- confirm
+// callers invoke this only after a calibration result exists.
+void calib::calibDataController::printParametersToConsole(std::ostream &output) const
+{
+    const char* border = "---------------------------------------------------";
+    output << border << std::endl;
+    output << "Frames used for calibration: " << std::max(mCalibData->objectPoints.size(), mCalibData->allCharucoCorners.size())
+           << " \t RMS = " << mCalibData->totalAvgErr << std::endl;
+    // Equal focal lengths are reported once as "F", otherwise as "Fx" and "Fy".
+    if(mCalibData->cameraMatrix.at<double>(0,0) == mCalibData->cameraMatrix.at<double>(1,1))
+        output << "F = " << mCalibData->cameraMatrix.at<double>(1,1) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(1) << std::endl;
+    else
+        output << "Fx = " << mCalibData->cameraMatrix.at<double>(0,0) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(0) << " \t "
+               << "Fy = " << mCalibData->cameraMatrix.at<double>(1,1) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(1) << std::endl;
+    output << "Cx = " << mCalibData->cameraMatrix.at<double>(0,2) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(2) << " \t"
+           << "Cy = " << mCalibData->cameraMatrix.at<double>(1,2) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(3) << std::endl;
+    // Coefficient i of distCoeffs is paired with stdDeviations entry 4+i; K3 is
+    // stored last (index 4) but printed before the TD1/TD2 terms.
+    output << "K1 = " << mCalibData->distCoeffs.at<double>(0) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(4) << std::endl;
+    output << "K2 = " << mCalibData->distCoeffs.at<double>(1) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(5) << std::endl;
+    output << "K3 = " << mCalibData->distCoeffs.at<double>(4) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(8) << std::endl;
+    output << "TD1 = " << mCalibData->distCoeffs.at<double>(2) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(6) << std::endl;
+    output << "TD2 = " << mCalibData->distCoeffs.at<double>(3) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(7) << std::endl;
+}
+
+// Recomputes the undistortion maps stored in the calibration data from the
+// current camera matrix and distortion coefficients.
+void calib::calibDataController::updateUndistortMap()
+{
+    const cv::Size& frameSize = mCalibData->imageSize;
+
+    // New camera matrix for the undistorted image (alpha = 0, same frame size).
+    const cv::Mat newCameraMatrix =
+            cv::getOptimalNewCameraMatrix(mCalibData->cameraMatrix, mCalibData->distCoeffs, frameSize, 0.0, frameSize);
+
+    cv::initUndistortRectifyMap(mCalibData->cameraMatrix, mCalibData->distCoeffs, cv::noArray(),
+                                newCameraMatrix, frameSize, CV_16SC2,
+                                mCalibData->undistMap1, mCalibData->undistMap2);
+}
--- a/apps/interactive-calibration/calibController.hpp
+++ b/apps/interactive-calibration/calibController.hpp
@ -0,0 +1,64 @@
+#ifndef CALIB_CONTROLLER_HPP
+#define CALIB_CONTROLLER_HPP
+
+#include "calibCommon.hpp"
+#include <stack>
+#include <string>
+#include <ostream>
+
+namespace calib {
+
+    // Tracks the quality of the running calibration (frame count, confidence
+    // intervals, RMS, image coverage) and optionally tunes calibration flags.
+    class calibController
+    {
+    protected:
+        cv::Ptr<calibrationData> mCalibData;
+        int mCalibFlags;
+        unsigned mMinFramesNum;
+        bool mNeedTuning;
+        bool mConfIntervalsState;
+        bool mCoverageQualityState;
+
+        // mean / stddev of per-cell point counts over a grid laid on the image.
+        double estimateCoverageQuality();
+    public:
+        calibController();
+        calibController(cv::Ptr<calibrationData> data, int initialFlags, bool autoTuning,
+                        int minFramesNum);
+
+        // Recomputes the state flags and, with auto tuning, the calibration flags.
+        void updateState();
+
+        // True when all individual state checks pass.
+        bool getCommonCalibrationState() const;
+
+        bool getFramesNumberState() const;
+        bool getConfidenceIntrervalsState() const;
+        bool getRMSState() const;
+        // NOTE(review): no definition of this member appears in
+        // calibController.cpp -- confirm it is defined (or unused) before calling it.
+        bool getPointsCoverageState() const;
+        // Calibration flags including those added by updateState().
+        int getNewFlags() const;
+    };
+
+    // Manages the collected calibration frames and a stack of earlier
+    // parameter snapshots (for undoing the last frame), and handles saving
+    // and printing of the current result.
+    class calibDataController
+    {
+    protected:
+        cv::Ptr<calibrationData> mCalibData;
+        std::stack<cameraParameters> mParamsStack;
+        std::string mParamsFileName;
+        unsigned mMaxFramesNum;
+        double mAlpha;
+
+        // Coverage quality of all frames except the one at excludedIndex.
+        double estimateGridSubsetQuality(size_t excludedIndex);
+    public:
+        calibDataController(cv::Ptr<calibrationData> data, int maxFrames, double convParameter);
+        calibDataController();
+
+        // Removes the worst frame once the frame limit is reached.
+        void filterFrames();
+        void setParametersFileName(const std::string& name);
+        // Drops the newest frame and restores the previous parameter snapshot.
+        void deleteLastFrame();
+        // Pushes a copy of the current parameters onto the snapshot stack.
+        void rememberCurrentParameters();
+        void deleteAllData();
+        // Returns true when the parameters were written to the parameters file.
+        bool saveCurrentCameraParameters() const;
+        void printParametersToConsole(std::ostream &output) const;
+        void updateUndistortMap();
+    };
+
+}
+
+#endif
--- a/apps/interactive-calibration/calibPipeline.cpp
+++ b/apps/interactive-calibration/calibPipeline.cpp
@ -0,0 +1,91 @@
+#include "calibPipeline.hpp"
+#include <opencv2/highgui.hpp>
+#include <stdexcept>
+
+using namespace calib;
+
+#define CAP_DELAY 10 // delay argument passed to cv::waitKey() after each displayed frame
+
+// Requests an oversized 10000x10000 frame from the capture object and returns
+// the width/height it reports afterwards (start() treats this as the largest
+// resolution the camera offers).
+cv::Size CalibPipeline::getCameraResolution()
+{
+    const double oversized = 10000;
+    mCapture.set(cv::CAP_PROP_FRAME_WIDTH, oversized);
+    mCapture.set(cv::CAP_PROP_FRAME_HEIGHT, oversized);
+
+    const int reportedWidth = static_cast<int>(mCapture.get(cv::CAP_PROP_FRAME_WIDTH));
+    const int reportedHeight = static_cast<int>(mCapture.get(cv::CAP_PROP_FRAME_HEIGHT));
+    return cv::Size(reportedWidth, reportedHeight);
+}
+/* Stores a copy of the capture parameters; the video source itself is opened later by start(). */
+CalibPipeline::CalibPipeline(captureParameters params) :
+    mCaptureParams(params)
+{
+
+}
+
+// Opens the configured video source if it is not open yet, then grabs frames
+// until a key press or a frame processor hands control back to the caller.
+//
+// processors: applied in order to a copy of every grabbed frame; the output of
+//             one processor is the input of the next, and the final image is
+//             shown in the window named mainWindowName.
+// Returns the PipelineExitStatus matching the pressed key, Calibrate as soon as
+// any processor reports isProcessed(), or Finished when no more frames can be
+// grabbed.
+// Throws std::runtime_error when the video source cannot be opened.
+PipelineExitStatus CalibPipeline::start(std::vector<cv::Ptr<FrameProcessor> > processors)
+{
+    typedef std::vector<cv::Ptr<FrameProcessor> >::iterator ProcessorIt;
+
+    if(mCaptureParams.source == Camera && !mCapture.isOpened())
+    {
+        mCapture.open(mCaptureParams.camID);
+        // Fail before touching any property of a device that did not open.
+        if(!mCapture.isOpened())
+            throw std::runtime_error("Unable to open video source");
+
+        cv::Size maxRes = getCameraResolution();
+        cv::Size neededRes = mCaptureParams.cameraResolution;
+        // A backend may report 0x0; the aspect ratio is then undefined, so the
+        // requested resolution is passed through unchanged.
+        const bool maxResKnown = maxRes.width > 0 && maxRes.height > 0;
+
+        if(maxResKnown && maxRes.width < neededRes.width) {
+            double aR = (double)maxRes.width / maxRes.height;
+            mCapture.set(cv::CAP_PROP_FRAME_WIDTH, neededRes.width);
+            mCapture.set(cv::CAP_PROP_FRAME_HEIGHT, neededRes.width/aR);
+        }
+        else if(maxResKnown && maxRes.height < neededRes.height) {
+            double aR = (double)maxRes.width / maxRes.height;
+            mCapture.set(cv::CAP_PROP_FRAME_HEIGHT, neededRes.height);
+            mCapture.set(cv::CAP_PROP_FRAME_WIDTH, neededRes.height*aR);
+        }
+        else {
+            mCapture.set(cv::CAP_PROP_FRAME_HEIGHT, neededRes.height);
+            mCapture.set(cv::CAP_PROP_FRAME_WIDTH, neededRes.width);
+        }
+        mCapture.set(cv::CAP_PROP_AUTOFOCUS, 0);
+    }
+    else if (mCaptureParams.source == File && !mCapture.isOpened())
+        mCapture.open(mCaptureParams.videoFileName);
+
+    if(!mCapture.isOpened())
+        throw std::runtime_error("Unable to open video source");
+
+    // Query the frame size only once the source is known to be open.
+    mImageSize = cv::Size((int)mCapture.get(cv::CAP_PROP_FRAME_WIDTH), (int)mCapture.get(cv::CAP_PROP_FRAME_HEIGHT));
+
+    cv::Mat frame, processedFrame;
+    while(mCapture.grab()) {
+        mCapture.retrieve(frame);
+        if(mCaptureParams.flipVertical)
+            cv::flip(frame, frame, -1); // flip code -1 mirrors around both axes
+
+        frame.copyTo(processedFrame);
+        for (ProcessorIt it = processors.begin(); it != processors.end(); ++it)
+            processedFrame = (*it)->processFrame(processedFrame);
+        cv::imshow(mainWindowName, processedFrame);
+        int key = cv::waitKey(CAP_DELAY);
+
+        switch(key)
+        {
+        case 27: // esc
+            return Finished;
+        case 'r':
+            return DeleteLastFrame;
+        case 'd':
+            return DeleteAllFrames;
+        case 's':
+            return SaveCurrentData;
+        case 'u':
+            return SwitchUndistort;
+        case 'v':
+            return SwitchVisualisation;
+        default:
+            break;
+        }
+
+        // No key command: hand over to calibration once any processor is done.
+        for (ProcessorIt it = processors.begin(); it != processors.end(); ++it)
+            if((*it)->isProcessed())
+                return Calibrate;
+    }
+
+    return Finished;
+}
+/* Returns the frame size cached in mImageSize, which start() fills from the capture's width/height properties. */
+cv::Size CalibPipeline::getImageSize() const
+{
+    return mImageSize;
+}
--- a/apps/interactive-calibration/calibPipeline.hpp
+++ b/apps/interactive-calibration/calibPipeline.hpp
@ -0,0 +1,39 @@
+#ifndef CALIB_PIPELINE_HPP
+#define CALIB_PIPELINE_HPP
+
+#include <vector>
+#include <opencv2/highgui.hpp>
+
+#include "calibCommon.hpp"
+#include "frameProcessor.hpp"
+
+namespace calib
+{
+/** Reasons for CalibPipeline::start() to hand control back to its caller. */
+enum PipelineExitStatus { Finished, // Esc pressed, or no further frame could be grabbed
+                                DeleteLastFrame, // 'r' key pressed
+                                Calibrate, // a frame processor reported isProcessed()
+                                DeleteAllFrames, // 'd' key pressed
+                                SaveCurrentData, // 's' key pressed
+                                SwitchUndistort, // 'u' key pressed
+                                SwitchVisualisation // 'v' key pressed
+                              };
+/**
+ * Capture-and-display loop of the calibration tool: owns the cv::VideoCapture,
+ * feeds grabbed frames through a chain of FrameProcessor objects and reports
+ * why the loop stopped via PipelineExitStatus.
+ */
+class CalibPipeline
+{
+protected:
+    captureParameters mCaptureParams; // copy of the parameters given to the constructor
+    cv::Size mImageSize; // frame size read from the capture in start()
+    cv::VideoCapture mCapture; // opened lazily by start()
+
+    cv::Size getCameraResolution(); // resolution reported after requesting a 10000x10000 frame
+
+public:
+    CalibPipeline(captureParameters params);
+    PipelineExitStatus start(std::vector<cv::Ptr<FrameProcessor> > processors); // throws std::runtime_error if the source cannot be opened
+    cv::Size getImageSize() const;
+};
+
+}
+
+#endif
--- a/apps/interactive-calibration/cvCalibrationFork.cpp
+++ b/apps/interactive-calibration/cvCalibrationFork.cpp
@ -0,0 +1,824 @@
+#include <opencv2/calib3d.hpp>
+#include "linalg.hpp"
+#include "cvCalibrationFork.hpp"
+
+using namespace cv;
+/* File-local helpers shared by the calibration routines below. */
+static void subMatrix(const cv::Mat& src, cv::Mat& dst, const std::vector<uchar>& cols,
+                      const std::vector<uchar>& rows); // forward declaration; defined later in this file
+static const char* cvDistCoeffErr = "Distortion coefficients must be 1x4, 4x1, 1x5, 5x1, 1x8, 8x1, 1x12, 12x1, 1x14 or 14x1 floating-point vector"; // message raised when distCoeffs has an unsupported type or shape
+
+// Fills _JtJ with the blocks of J^T*J (J = Jacobian of the projected points
+// with respect to all calibration parameters) evaluated at `param`.
+//
+// _JtJ:              output, zeroed first; must cover NINTRINSIC + 6*nimages
+//                    rows and columns. Only the intrinsic block, the per-view
+//                    6x6 diagonal blocks and the intrinsic/extrinsic blocks in
+//                    the first NINTRINSIC rows are written; the caller is
+//                    expected to mirror them if a full symmetric matrix is
+//                    needed.
+// camera_matrix,
+// distortion_coeffs: intrinsics forwarded to cvProjectPoints2.
+// object_points:     object points of all views, concatenated column-wise.
+// param:             parameter vector; rows NINTRINSIC + 6*i .. +3 hold the
+//                    rotation and the following 3 rows the translation of
+//                    view i.
+// npoints:           integer vector with the number of points per view; every
+//                    entry must be >= 4, otherwise CV_StsOutOfRange is raised.
+// flags:             CALIB_FIX_FOCAL_LENGTH / CALIB_FIX_PRINCIPAL_POINT suppress
+//                    the corresponding derivative outputs, and
+//                    CALIB_FIX_ASPECT_RATIO forwards aspectRatio.
+//
+// Only the Jacobians are needed here, so no residual against measured image
+// points is computed.
+static void cvEvaluateJtJ2(CvMat* _JtJ,
+                            const CvMat* camera_matrix,
+                            const CvMat* distortion_coeffs,
+                            const CvMat* object_points,
+                            const CvMat* param,
+                            const CvMat* npoints,
+                            int flags, int NINTRINSIC, double aspectRatio)
+{
+    int i, pos, ni, npstep = 0, maxPoints = 0;
+
+    npstep = npoints->rows == 1 ? 1 : npoints->step/CV_ELEM_SIZE(npoints->type);
+    int nimages = npoints->rows*npoints->cols;
+    for( i = 0; i < nimages; i++ )
+    {
+        ni = npoints->data.i[i*npstep];
+        if( ni < 4 )
+        {
+            CV_Error_( CV_StsOutOfRange, ("The number of points in the view #%d is < 4", i));
+        }
+        maxPoints = MAX( maxPoints, ni );
+    }
+
+    // Work buffers sized for the largest view and shrunk per view below.
+    Mat _Ji( maxPoints*2, NINTRINSIC, CV_64FC1, Scalar(0));
+    Mat _Je( maxPoints*2, 6, CV_64FC1 );
+    Mat _err( maxPoints*2, 1, CV_64FC1 ); // receives the projected points (required output of cvProjectPoints2)
+    const Mat matM = cvarrToMat(object_points);
+
+    cvZero(_JtJ);
+    Mat JtJ(cvarrToMat(_JtJ)); // header over _JtJ's data, no copy
+    for(i = 0, pos = 0; i < nimages; i++, pos += ni )
+    {
+        CvMat _ri, _ti;
+        ni = npoints->data.i[i*npstep];
+
+        cvGetRows( param, &_ri, NINTRINSIC + i*6, NINTRINSIC + i*6 + 3 );
+        cvGetRows( param, &_ti, NINTRINSIC + i*6 + 3, NINTRINSIC + i*6 + 6 );
+
+        CvMat _Mi(matM.colRange(pos, pos + ni));
+
+        _Je.resize(ni*2); _Ji.resize(ni*2); _err.resize(ni*2);
+        CvMat _dpdr(_Je.colRange(0, 3));
+        CvMat _dpdt(_Je.colRange(3, 6));
+        CvMat _dpdf(_Ji.colRange(0, 2));
+        CvMat _dpdc(_Ji.colRange(2, 4));
+        CvMat _dpdk(_Ji.colRange(4, NINTRINSIC));
+        CvMat _mp(_err.reshape(2, 1));
+
+        cvProjectPoints2( &_Mi, &_ri, &_ti, camera_matrix, distortion_coeffs, &_mp, &_dpdr, &_dpdt,
+                          (flags & CALIB_FIX_FOCAL_LENGTH) ? 0 : &_dpdf,
+                          (flags & CALIB_FIX_PRINCIPAL_POINT) ? 0 : &_dpdc, &_dpdk,
+                          (flags & CALIB_FIX_ASPECT_RATIO) ? aspectRatio : 0);
+
+        // see HZ: (A6.14) for details on the structure of the Jacobian
+        JtJ(Rect(0, 0, NINTRINSIC, NINTRINSIC)) += _Ji.t() * _Ji;
+        JtJ(Rect(NINTRINSIC + i * 6, NINTRINSIC + i * 6, 6, 6)) = _Je.t() * _Je;
+        JtJ(Rect(NINTRINSIC + i * 6, 0, 6, NINTRINSIC)) = _Ji.t() * _Je;
+    }
+}
+/**
+ * Camera calibration routine of the cvfork namespace. In addition to the
+ * intrinsics, distortion coefficients and per-view poses it can report
+ * per-parameter standard deviations (stdDevs) and per-view RMS errors
+ * (perViewErrors).
+ *
+ * The body follows the numbered steps in its comments: validate arguments,
+ * initialise intrinsics and the CvLevMarqFork solver, initialise per-view
+ * extrinsics, iterate the solver, store the results.
+ *
+ * objectPoints, imagePoints: points of all views concatenated; npoints is a
+ *     1-D CV_32SC1 vector with the per-view counts (each must be >= 4).
+ * imageSize: must have positive width and height.
+ * cameraMatrix: 3x3 CV_32FC1/CV_64FC1; read when CALIB_USE_INTRINSIC_GUESS is
+ *     set (or, for the aspect ratio only, when CALIB_FIX_ASPECT_RATIO is set)
+ *     and overwritten with the result.
+ * distCoeffs: 1xN or Nx1 floating-point vector, N in {4, 5, 8, 12, 14};
+ *     overwritten with the result.
+ * rvecs, tvecs, stdDevs, perViewErrors: optional outputs, may be NULL.
+ *     stdDevs must hold NINTRINSIC + 6*nimages single-channel values.
+ *     NOTE(review): stdDevs is written through at<double>() and
+ *     perViewErrors through data.db, so both effectively have to be CV_64F
+ *     even though the stdDevs check also admits CV_32F, and perViewErrors is
+ *     not validated at all - confirm callers always pass CV_64F.
+ * flags: CALIB_* / CV_CALIB_* bit flags; termCrit: solver termination criteria.
+ *
+ * Returns sqrt(reprojErr/total), where reprojErr is the sum of squared
+ * residuals of the last evaluated iteration and total the number of points.
+ * Invalid arguments are reported through CV_Error.
+ */
+double cvfork::cvCalibrateCamera2( const CvMat* objectPoints,
+                    const CvMat* imagePoints, const CvMat* npoints,
+                    CvSize imageSize, CvMat* cameraMatrix, CvMat* distCoeffs,
+                    CvMat* rvecs, CvMat* tvecs, CvMat* stdDevs, CvMat* perViewErrors, int flags, CvTermCriteria termCrit )
+{
+    {
+        const int NINTRINSIC = CV_CALIB_NINTRINSIC;
+        double reprojErr = 0;
+
+        Matx33d A;
+        double k[14] = {0}; // working copy of the distortion coefficients, wrapped by _k below
+        CvMat matA = cvMat(3, 3, CV_64F, A.val), _k;
+        int i, nimages, maxPoints = 0, ni = 0, pos, total = 0, nparams, npstep, cn;
+        double aspectRatio = 0.;
+
+        // 0. check the parameters & allocate buffers
+        if( !CV_IS_MAT(objectPoints) || !CV_IS_MAT(imagePoints) ||
+            !CV_IS_MAT(npoints) || !CV_IS_MAT(cameraMatrix) || !CV_IS_MAT(distCoeffs) )
+            CV_Error( CV_StsBadArg, "One of required vector arguments is not a valid matrix" );
+
+        if( imageSize.width <= 0 || imageSize.height <= 0 )
+            CV_Error( CV_StsOutOfRange, "image width and height must be positive" );
+
+        if( CV_MAT_TYPE(npoints->type) != CV_32SC1 ||
+            (npoints->rows != 1 && npoints->cols != 1) )
+            CV_Error( CV_StsUnsupportedFormat,
+                "the array of point counters must be 1-dimensional integer vector" );
+        if(flags & CV_CALIB_TILTED_MODEL)
+        {
+            //when the tilted sensor model is used the distortion coefficients matrix must have 14 parameters
+            if (distCoeffs->cols*distCoeffs->rows != 14)
+                CV_Error( CV_StsBadArg, "The tilted sensor model must have 14 parameters in the distortion matrix" );
+        }
+        else
+        {
+            //when the thin prism model is used the distortion coefficients matrix must have 12 parameters
+            if(flags & CV_CALIB_THIN_PRISM_MODEL)
+                if (distCoeffs->cols*distCoeffs->rows != 12)
+                    CV_Error( CV_StsBadArg, "Thin prism model must have 12 parameters in the distortion matrix" );
+        }
+
+        nimages = npoints->rows*npoints->cols;
+        npstep = npoints->rows == 1 ? 1 : npoints->step/CV_ELEM_SIZE(npoints->type); // element stride between consecutive counters
+
+        if( rvecs )
+        {
+            cn = CV_MAT_CN(rvecs->type);
+            if( !CV_IS_MAT(rvecs) ||
+                (CV_MAT_DEPTH(rvecs->type) != CV_32F && CV_MAT_DEPTH(rvecs->type) != CV_64F) ||
+                ((rvecs->rows != nimages || (rvecs->cols*cn != 3 && rvecs->cols*cn != 9)) &&
+                (rvecs->rows != 1 || rvecs->cols != nimages || cn != 3)) )
+                CV_Error( CV_StsBadArg, "the output array of rotation vectors must be 3-channel "
+                    "1xn or nx1 array or 1-channel nx3 or nx9 array, where n is the number of views" );
+        }
+
+        if( tvecs )
+        {
+            cn = CV_MAT_CN(tvecs->type);
+            if( !CV_IS_MAT(tvecs) ||
+                (CV_MAT_DEPTH(tvecs->type) != CV_32F && CV_MAT_DEPTH(tvecs->type) != CV_64F) ||
+                ((tvecs->rows != nimages || tvecs->cols*cn != 3) &&
+                (tvecs->rows != 1 || tvecs->cols != nimages || cn != 3)) )
+                CV_Error( CV_StsBadArg, "the output array of translation vectors must be 3-channel "
+                    "1xn or nx1 array or 1-channel nx3 array, where n is the number of views" );
+        }
+
+        if( stdDevs )
+        {
+            cn = CV_MAT_CN(stdDevs->type);
+            if( !CV_IS_MAT(stdDevs) ||
+                (CV_MAT_DEPTH(stdDevs->type) != CV_32F && CV_MAT_DEPTH(stdDevs->type) != CV_64F) ||
+                ((stdDevs->rows != (nimages*6 + NINTRINSIC) || stdDevs->cols*cn != 1) &&
+                (stdDevs->rows != 1 || stdDevs->cols != (nimages*6 + NINTRINSIC) || cn != 1)) )
+                CV_Error( CV_StsBadArg, "the output array of standard deviations vectors must be 1-channel "
+                    "1x(n*6 + NINTRINSIC) or (n*6 + NINTRINSIC)x1 array, where n is the number of views" );
+        }
+
+        if( (CV_MAT_TYPE(cameraMatrix->type) != CV_32FC1 &&
+            CV_MAT_TYPE(cameraMatrix->type) != CV_64FC1) ||
+            cameraMatrix->rows != 3 || cameraMatrix->cols != 3 )
+            CV_Error( CV_StsBadArg,
+                "Intrinsic parameters must be 3x3 floating-point matrix" );
+
+        if( (CV_MAT_TYPE(distCoeffs->type) != CV_32FC1 &&
+            CV_MAT_TYPE(distCoeffs->type) != CV_64FC1) ||
+            (distCoeffs->cols != 1 && distCoeffs->rows != 1) ||
+            (distCoeffs->cols*distCoeffs->rows != 4 &&
+            distCoeffs->cols*distCoeffs->rows != 5 &&
+            distCoeffs->cols*distCoeffs->rows != 8 &&
+            distCoeffs->cols*distCoeffs->rows != 12 &&
+            distCoeffs->cols*distCoeffs->rows != 14) )
+            CV_Error( CV_StsBadArg, cvDistCoeffErr );
+
+        for( i = 0; i < nimages; i++ )
+        {
+            ni = npoints->data.i[i*npstep];
+            if( ni < 4 )
+            {
+                CV_Error_( CV_StsOutOfRange, ("The number of points in the view #%d is < 4", i));
+            }
+            maxPoints = MAX( maxPoints, ni );
+            total += ni;
+        }
+
+        Mat matM( 1, total, CV_64FC3 );
+        Mat _m( 1, total, CV_64FC2 );
+
+        if(CV_MAT_CN(objectPoints->type) == 3) {
+            cvarrToMat(objectPoints).convertTo(matM, CV_64F);
+        } else {
+            convertPointsHomogeneous(cvarrToMat(objectPoints), matM);
+        }
+
+        if(CV_MAT_CN(imagePoints->type) == 2) {
+            cvarrToMat(imagePoints).convertTo(_m, CV_64F);
+        } else {
+            convertPointsHomogeneous(cvarrToMat(imagePoints), _m);
+        }
+
+        nparams = NINTRINSIC + nimages*6; // intrinsics followed by 3 rotation + 3 translation values per view
+        Mat _Ji( maxPoints*2, NINTRINSIC, CV_64FC1, Scalar(0));
+        Mat _Je( maxPoints*2, 6, CV_64FC1 );
+        Mat _err( maxPoints*2, 1, CV_64FC1 );
+
+        _k = cvMat( distCoeffs->rows, distCoeffs->cols, CV_MAKETYPE(CV_64F,CV_MAT_CN(distCoeffs->type)), k);
+        if( distCoeffs->rows*distCoeffs->cols*CV_MAT_CN(distCoeffs->type) < 8 ) // too few coefficients supplied: fix the ones that have no storage
+        {
+            if( distCoeffs->rows*distCoeffs->cols*CV_MAT_CN(distCoeffs->type) < 5 )
+                flags |= CALIB_FIX_K3;
+            flags |= CALIB_FIX_K4 | CALIB_FIX_K5 | CALIB_FIX_K6;
+        }
+        const double minValidAspectRatio = 0.01;
+        const double maxValidAspectRatio = 100.0;
+
+        // 1. initialize intrinsic parameters & LM solver
+        if( flags & CALIB_USE_INTRINSIC_GUESS )
+        {
+            cvConvert( cameraMatrix, &matA );
+            if( A(0, 0) <= 0 || A(1, 1) <= 0 )
+                CV_Error( CV_StsOutOfRange, "Focal length (fx and fy) must be positive" );
+            if( A(0, 2) < 0 || A(0, 2) >= imageSize.width ||
+                A(1, 2) < 0 || A(1, 2) >= imageSize.height )
+                CV_Error( CV_StsOutOfRange, "Principal point must be within the image" );
+            if( fabs(A(0, 1)) > 1e-5 )
+                CV_Error( CV_StsOutOfRange, "Non-zero skew is not supported by the function" );
+            if( fabs(A(1, 0)) > 1e-5 || fabs(A(2, 0)) > 1e-5 ||
+                fabs(A(2, 1)) > 1e-5 || fabs(A(2,2)-1) > 1e-5 )
+                CV_Error( CV_StsOutOfRange,
+                    "The intrinsic matrix must have [fx 0 cx; 0 fy cy; 0 0 1] shape" );
+            A(0, 1) = A(1, 0) = A(2, 0) = A(2, 1) = 0.;
+            A(2, 2) = 1.;
+
+            if( flags & CALIB_FIX_ASPECT_RATIO )
+            {
+                aspectRatio = A(0, 0)/A(1, 1);
+
+                if( aspectRatio < minValidAspectRatio || aspectRatio > maxValidAspectRatio )
+                    CV_Error( CV_StsOutOfRange,
+                        "The specified aspect ratio (= cameraMatrix[0][0] / cameraMatrix[1][1]) is incorrect" );
+            }
+            cvConvert( distCoeffs, &_k );
+        }
+        else
+        {
+            Scalar mean, sdv;
+            meanStdDev(matM, mean, sdv);
+            if( fabs(mean[2]) > 1e-5 || fabs(sdv[2]) > 1e-5 ) // without an initial guess all object points must have z == 0
+                CV_Error( CV_StsBadArg,
+                "For non-planar calibration rigs the initial intrinsic matrix must be specified" );
+            for( i = 0; i < total; i++ )
+                matM.at<Point3d>(i).z = 0.;
+
+            if( flags & CALIB_FIX_ASPECT_RATIO )
+            {
+                aspectRatio = cvmGet(cameraMatrix,0,0);
+                aspectRatio /= cvmGet(cameraMatrix,1,1);
+                if( aspectRatio < minValidAspectRatio || aspectRatio > maxValidAspectRatio )
+                    CV_Error( CV_StsOutOfRange,
+                        "The specified aspect ratio (= cameraMatrix[0][0] / cameraMatrix[1][1]) is incorrect" );
+            }
+            CvMat _matM(matM), m(_m);
+            cvInitIntrinsicParams2D( &_matM, &m, npoints, imageSize, &matA, aspectRatio );
+        }
+
+        //CvLevMarq solver( nparams, 0, termCrit );
+        cvfork::CvLevMarqFork solver( nparams, 0, termCrit );
+        Mat allErrors(1, total, CV_64FC2); // per-point residuals, filled when stdDevs or perViewErrors is requested
+
+        if(flags & CALIB_USE_LU) {
+            solver.solveMethod = DECOMP_LU;
+        }
+        else if(flags & CALIB_USE_QR)
+            solver.solveMethod = DECOMP_QR;
+
+        {
+        double* param = solver.param->data.db;
+        uchar* mask = solver.mask->data.ptr;
+
+        param[0] = A(0, 0); param[1] = A(1, 1); param[2] = A(0, 2); param[3] = A(1, 2);
+        std::copy(k, k + 14, param + 4); // param[4..17] hold the 14 distortion coefficients
+
+        if( flags & CV_CALIB_FIX_FOCAL_LENGTH )
+            mask[0] = mask[1] = 0;
+        if( flags & CV_CALIB_FIX_PRINCIPAL_POINT )
+            mask[2] = mask[3] = 0;
+        if( flags & CV_CALIB_ZERO_TANGENT_DIST )
+        {
+            param[6] = param[7] = 0;
+            mask[6] = mask[7] = 0;
+        }
+        if( !(flags & CALIB_RATIONAL_MODEL) )
+            flags |= CALIB_FIX_K4 + CALIB_FIX_K5 + CALIB_FIX_K6;
+        if( !(flags & CV_CALIB_THIN_PRISM_MODEL))
+            flags |= CALIB_FIX_S1_S2_S3_S4;
+        if( !(flags & CV_CALIB_TILTED_MODEL))
+            flags |= CALIB_FIX_TAUX_TAUY;
+
+        mask[ 4] = !(flags & CALIB_FIX_K1);
+        mask[ 5] = !(flags & CALIB_FIX_K2);
+        mask[ 8] = !(flags & CALIB_FIX_K3);
+        mask[ 9] = !(flags & CALIB_FIX_K4);
+        mask[10] = !(flags & CALIB_FIX_K5);
+        mask[11] = !(flags & CALIB_FIX_K6);
+
+        if(flags & CALIB_FIX_S1_S2_S3_S4)
+        {
+            mask[12] = 0;
+            mask[13] = 0;
+            mask[14] = 0;
+            mask[15] = 0;
+        }
+        if(flags & CALIB_FIX_TAUX_TAUY)
+        {
+            mask[16] = 0;
+            mask[17] = 0;
+        }
+        }
+
+        // 2. initialize extrinsic parameters
+        for( i = 0, pos = 0; i < nimages; i++, pos += ni )
+        {
+            CvMat _ri, _ti;
+            ni = npoints->data.i[i*npstep];
+
+            cvGetRows( solver.param, &_ri, NINTRINSIC + i*6, NINTRINSIC + i*6 + 3 );
+            cvGetRows( solver.param, &_ti, NINTRINSIC + i*6 + 3, NINTRINSIC + i*6 + 6 );
+
+            CvMat _Mi(matM.colRange(pos, pos + ni));
+            CvMat _mi(_m.colRange(pos, pos + ni));
+
+            cvFindExtrinsicCameraParams2( &_Mi, &_mi, &matA, &_k, &_ri, &_ti );
+        }
+
+        // 3. run the optimization
+        for(;;)
+        {
+            const CvMat* _param = 0;
+            CvMat *_JtJ = 0, *_JtErr = 0;
+            double* _errNorm = 0;
+            bool proceed = solver.updateAlt( _param, _JtJ, _JtErr, _errNorm );
+            double *param = solver.param->data.db, *pparam = solver.prevParam->data.db;
+
+            if( flags & CALIB_FIX_ASPECT_RATIO )
+            {
+                param[0] = param[1]*aspectRatio;
+                pparam[0] = pparam[1]*aspectRatio;
+            }
+
+            A(0, 0) = param[0]; A(1, 1) = param[1]; A(0, 2) = param[2]; A(1, 2) = param[3];
+            std::copy(param + 4, param + 4 + 14, k);
+
+            if( !proceed ) { // solver reports no further step: optionally estimate deviations, then leave the loop
+                //do errors estimation
+                if(stdDevs) {
+                    Ptr<CvMat> JtJ(cvCreateMat(nparams, nparams, CV_64F));
+                    CvMat cvMatM(matM);
+                    cvEvaluateJtJ2(JtJ, &matA, &_k, &cvMatM, solver.param, npoints, flags, NINTRINSIC, aspectRatio);
+
+                    Mat mask = cvarrToMat(solver.mask);
+                    int nparams_nz = countNonZero(mask);
+                    Mat JtJinv, JtJN;
+                    JtJN.create(nparams_nz, nparams_nz, CV_64F);
+                    subMatrix(cvarrToMat(JtJ), JtJN, mask, mask); // keep only rows/columns of parameters that are not masked out
+                    completeSymm(JtJN, false);
+    #ifndef USE_LAPACK
+                    cv::invert(JtJN, JtJinv, DECOMP_SVD);
+    #else
+                    cvfork::invert(JtJN, JtJinv, DECOMP_SVD);
+    #endif
+                    double sigma2 = norm(allErrors, NORM_L2SQR) / (total - nparams_nz); // NOTE(review): not guarded against total <= nparams_nz
+                    Mat stdDevsM = cvarrToMat(stdDevs);
+                    int j = 0;
+                    for (int s = 0; s < nparams; s++)
+                        if(mask.data[s]) {
+                            stdDevsM.at<double>(s) = std::sqrt(JtJinv.at<double>(j,j)*sigma2);
+                            j++;
+                        }
+                        else
+                            stdDevsM.at<double>(s) = 0; // masked-out parameters get a zero deviation
+                }
+                break;
+            }
+
+            reprojErr = 0;
+
+            for( i = 0, pos = 0; i < nimages; i++, pos += ni )
+            {
+                CvMat _ri, _ti;
+                ni = npoints->data.i[i*npstep];
+
+                cvGetRows( solver.param, &_ri, NINTRINSIC + i*6, NINTRINSIC + i*6 + 3 );
+                cvGetRows( solver.param, &_ti, NINTRINSIC + i*6 + 3, NINTRINSIC + i*6 + 6 );
+
+                CvMat _Mi(matM.colRange(pos, pos + ni));
+                CvMat _mi(_m.colRange(pos, pos + ni));
+                CvMat _me(allErrors.colRange(pos, pos + ni));
+
+                _Je.resize(ni*2); _Ji.resize(ni*2); _err.resize(ni*2);
+                CvMat _dpdr(_Je.colRange(0, 3));
+                CvMat _dpdt(_Je.colRange(3, 6));
+                CvMat _dpdf(_Ji.colRange(0, 2));
+                CvMat _dpdc(_Ji.colRange(2, 4));
+                CvMat _dpdk(_Ji.colRange(4, NINTRINSIC));
+                CvMat _mp(_err.reshape(2, 1)); // 2-channel view sharing storage with _err
+
+                if( solver.state == CvLevMarq::CALC_J )
+                {
+                     cvProjectPoints2( &_Mi, &_ri, &_ti, &matA, &_k, &_mp, &_dpdr, &_dpdt,
+                                      (flags & CALIB_FIX_FOCAL_LENGTH) ? 0 : &_dpdf,
+                                      (flags & CALIB_FIX_PRINCIPAL_POINT) ? 0 : &_dpdc, &_dpdk,
+                                      (flags & CALIB_FIX_ASPECT_RATIO) ? aspectRatio : 0);
+                }
+                else
+                    cvProjectPoints2( &_Mi, &_ri, &_ti, &matA, &_k, &_mp );
+
+                cvSub( &_mp, &_mi, &_mp ); // residual = projected - measured, stored in _err through _mp
+
+                if( solver.state == CvLevMarq::CALC_J )
+                {
+                    Mat JtJ(cvarrToMat(_JtJ)), JtErr(cvarrToMat(_JtErr));
+
+                    // see HZ: (A6.14) for details on the structure of the Jacobian
+                    JtJ(Rect(0, 0, NINTRINSIC, NINTRINSIC)) += _Ji.t() * _Ji;
+                    JtJ(Rect(NINTRINSIC + i * 6, NINTRINSIC + i * 6, 6, 6)) = _Je.t() * _Je;
+                    JtJ(Rect(NINTRINSIC + i * 6, 0, 6, NINTRINSIC)) = _Ji.t() * _Je;
+
+                    JtErr.rowRange(0, NINTRINSIC) += _Ji.t() * _err;
+                    JtErr.rowRange(NINTRINSIC + i * 6, NINTRINSIC + (i + 1) * 6) = _Je.t() * _err;
+
+                }
+                if (stdDevs || perViewErrors)
+                    cvCopy(&_mp, &_me);
+                reprojErr += norm(_err, NORM_L2SQR);
+            }
+
+            if( _errNorm )
+                *_errNorm = reprojErr;
+        }
+
+        // 4. store the results
+        cvConvert( &matA, cameraMatrix );
+        cvConvert( &_k, distCoeffs );
+
+        for( i = 0, pos = 0; i < nimages; i++)
+        {
+            CvMat src, dst;
+            if( perViewErrors )
+            {
+                ni = npoints->data.i[i*npstep];
+                perViewErrors->data.db[i] = std::sqrt(cv::norm(allErrors.colRange(pos, pos + ni), NORM_L2SQR) / ni);
+                pos+=ni;
+            }
+
+            if( rvecs )
+            {
+                src = cvMat( 3, 1, CV_64F, solver.param->data.db + NINTRINSIC + i*6 );
+                if( rvecs->rows == nimages && rvecs->cols*CV_MAT_CN(rvecs->type) == 9 )
+                {
+                    dst = cvMat( 3, 3, CV_MAT_DEPTH(rvecs->type),
+                        rvecs->data.ptr + rvecs->step*i );
+                    cvRodrigues2( &src, &matA ); // matA is reused as scratch; cameraMatrix was already stored above
+                    cvConvert( &matA, &dst );
+                }
+                else
+                {
+                    dst = cvMat( 3, 1, CV_MAT_DEPTH(rvecs->type), rvecs->rows == 1 ?
+                        rvecs->data.ptr + i*CV_ELEM_SIZE(rvecs->type) :
+                        rvecs->data.ptr + rvecs->step*i );
+                    cvConvert( &src, &dst );
+                }
+            }
+            if( tvecs )
+            {
+                src = cvMat( 3, 1, CV_64F, solver.param->data.db + NINTRINSIC + i*6 + 3 );
+                dst = cvMat( 3, 1, CV_MAT_DEPTH(tvecs->type), tvecs->rows == 1 ?
+                        tvecs->data.ptr + i*CV_ELEM_SIZE(tvecs->type) :
+                        tvecs->data.ptr + tvecs->step*i );
+                cvConvert( &src, &dst );
+             }
+        }
+
+        return std::sqrt(reprojErr/total);
+    }
+}
+
+
+// Returns a 3x3 matrix of type rtype: the converted contents of cameraMatrix0
+// when that is already 3x3, an identity matrix otherwise.
+static Mat prepareCameraMatrix(Mat& cameraMatrix0, int rtype)
+{
+    Mat result = Mat::eye(3, 3, rtype);
+    const bool inputIs3x3 = ( cameraMatrix0.size() == result.size() );
+    if( inputIs3x3 )
+    {
+        cameraMatrix0.convertTo(result, rtype);
+    }
+    return result;
+}
+
+// Returns a zero-filled 14-element vector of type rtype, oriented like
+// distCoeffs0 (14x1 rows-by-cols when the input has a single column, 1x14
+// otherwise). When distCoeffs0 is a 1xN or Nx1 vector with N in
+// {4, 5, 8, 12, 14}, its converted values are written into the leading
+// elements; any other shape leaves the result all zeros.
+static Mat prepareDistCoeffs(Mat& distCoeffs0, int rtype)
+{
+    const bool singleColumn = ( distCoeffs0.cols == 1 );
+    Mat padded = Mat::zeros(singleColumn ? Size(1, 14) : Size(14, 1), rtype);
+
+    // Length of the input when it is a vector, 0 for any other shape.
+    int count = 0;
+    if( singleColumn )
+        count = distCoeffs0.rows;
+    else if( distCoeffs0.rows == 1 )
+        count = distCoeffs0.cols;
+
+    switch( count )
+    {
+    case 4:
+    case 5:
+    case 8:
+    case 12:
+    case 14:
+        {
+            // Header over the leading part of `padded`; convertTo fills it in place.
+            Mat head(padded, Rect(0, 0, distCoeffs0.cols, distCoeffs0.rows));
+            distCoeffs0.convertTo(head, rtype);
+        }
+        break;
+    default:
+        break;
+    }
+    return padded;
+}
+
+// Flattens per-view point lists into single-row matrices.
+//
+// objectPoints:  one CV_32F 3-D point vector per view.
+// imagePoints1:  one CV_32F 2-D point vector per view, same length as the
+//                matching object point vector.
+// imagePoints2:  second set of image points; only read when imgPtMat2 is
+//                non-null.
+// objPtMat, imgPtMat1, imgPtMat2: receive all points of all views as
+//                1 x total CV_32FC3 / CV_32FC2 rows.
+// npoints:       receives the number of points of every view (1 x nimages,
+//                CV_32S).
+// Malformed input is reported through CV_Error / CV_Assert.
+static void collectCalibrationData( InputArrayOfArrays objectPoints,
+                                    InputArrayOfArrays imagePoints1,
+                                    InputArrayOfArrays imagePoints2,
+                                    Mat& objPtMat, Mat& imgPtMat1, Mat* imgPtMat2,
+                                    Mat& npoints )
+{
+    const int nimages = (int)objectPoints.total();
+    CV_Assert(nimages > 0 && nimages == (int)imagePoints1.total() &&
+        (!imgPtMat2 || nimages == (int)imagePoints2.total()));
+
+    // Pass 1: validate every view and count the points of all views.
+    int pointCount = 0;
+    for( int view = 0; view < nimages; ++view )
+    {
+        const int ni = objectPoints.getMat(view).checkVector(3, CV_32F);
+        if( ni <= 0 )
+            CV_Error(CV_StsUnsupportedFormat, "objectPoints should contain vector of vectors of points of type Point3f");
+        const int ni1 = imagePoints1.getMat(view).checkVector(2, CV_32F);
+        if( ni1 <= 0 )
+            CV_Error(CV_StsUnsupportedFormat, "imagePoints1 should contain vector of vectors of points of type Point2f");
+        CV_Assert( ni == ni1 );
+
+        pointCount += ni;
+    }
+
+    npoints.create(1, (int)nimages, CV_32S);
+    objPtMat.create(1, (int)pointCount, CV_32FC3);
+    imgPtMat1.create(1, (int)pointCount, CV_32FC2);
+
+    Point2f* imgPtData2 = 0;
+    if( imgPtMat2 )
+    {
+        imgPtMat2->create(1, (int)pointCount, CV_32FC2);
+        imgPtData2 = imgPtMat2->ptr<Point2f>();
+    }
+
+    Point3f* objPtData = objPtMat.ptr<Point3f>();
+    Point2f* imgPtData1 = imgPtMat1.ptr<Point2f>();
+
+    // Pass 2: copy the points of each view behind those of the previous views.
+    int offset = 0;
+    for( int view = 0; view < nimages; ++view )
+    {
+        Mat objpt = objectPoints.getMat(view);
+        Mat imgpt1 = imagePoints1.getMat(view);
+        const int ni = objpt.checkVector(3, CV_32F);
+        npoints.at<int>(view) = ni;
+
+        memcpy( objPtData + offset, objpt.ptr(), ni*sizeof(objPtData[0]) );
+        memcpy( imgPtData1 + offset, imgpt1.ptr(), ni*sizeof(imgPtData1[0]) );
+
+        if( imgPtData2 )
+        {
+            Mat imgpt2 = imagePoints2.getMat(view);
+            int ni2 = imgpt2.checkVector(2, CV_32F);
+            CV_Assert( ni == ni2 );
+            memcpy( imgPtData2 + offset, imgpt2.ptr(), ni*sizeof(imgPtData2[0]) );
+        }
+
+        offset += ni;
+    }
+}
+/**
+ * C++ front end of cvfork::cvCalibrateCamera2.
+ *
+ * Converts the camera matrix and distortion coefficients to CV_64F working
+ * copies, flattens the per-view point lists with collectCalibrationData, runs
+ * the calibration and copies the results back into the output arrays.
+ *
+ * Unless CALIB_RATIONAL_MODEL, CALIB_THIN_PRISM_MODEL or CALIB_TILTED_MODEL is
+ * set, only the first 5 distortion coefficients are used and returned.
+ * _rvecs / _tvecs may be a single matrix or a vector of matrices.
+ * _stdDeviations receives nimages*6 + CV_CALIB_NINTRINSIC CV_64F values and
+ * _perViewErrors receives nimages CV_64F values; each output is only filled
+ * when its needed() reports true.
+ *
+ * Returns the value returned by cvCalibrateCamera2.
+ */
+double cvfork::calibrateCamera(InputArrayOfArrays _objectPoints,
+                            InputArrayOfArrays _imagePoints,
+                            Size imageSize, InputOutputArray _cameraMatrix, InputOutputArray _distCoeffs,
+                            OutputArrayOfArrays _rvecs, OutputArrayOfArrays _tvecs, OutputArray _stdDeviations, OutputArray _perViewErrors, int flags, TermCriteria criteria )
+{
+    int rtype = CV_64F;
+    Mat cameraMatrix = _cameraMatrix.getMat();
+    cameraMatrix = prepareCameraMatrix(cameraMatrix, rtype);
+    Mat distCoeffs = _distCoeffs.getMat();
+    distCoeffs = prepareDistCoeffs(distCoeffs, rtype);
+    if( !(flags & CALIB_RATIONAL_MODEL) &&
+    (!(flags & CALIB_THIN_PRISM_MODEL)) &&
+    (!(flags & CALIB_TILTED_MODEL)))
+        distCoeffs = distCoeffs.rows == 1 ? distCoeffs.colRange(0, 5) : distCoeffs.rowRange(0, 5); // basic model: keep the first 5 coefficients only
+
+    int nimages = int(_objectPoints.total());
+    CV_Assert( nimages > 0 );
+    Mat objPt, imgPt, npoints, rvecM, tvecM, stdDeviationsM, errorsM;
+
+    bool rvecs_needed = _rvecs.needed(), tvecs_needed = _tvecs.needed(),
+            stddev_needed = _stdDeviations.needed(), errors_needed = _perViewErrors.needed();
+
+    bool rvecs_mat_vec = _rvecs.isMatVector();
+    bool tvecs_mat_vec = _tvecs.isMatVector();
+
+    if( rvecs_needed ) {
+        _rvecs.create(nimages, 1, CV_64FC3);
+
+        if(rvecs_mat_vec)
+            rvecM.create(nimages, 3, CV_64F); // separate buffer; copied into the vector elements after calibration
+        else
+            rvecM = _rvecs.getMat(); // header over the output array, so results are written in place
+    }
+
+    if( tvecs_needed ) {
+        _tvecs.create(nimages, 1, CV_64FC3);
+
+        if(tvecs_mat_vec)
+            tvecM.create(nimages, 3, CV_64F);
+        else
+            tvecM = _tvecs.getMat();
+    }
+
+    if( stddev_needed ) {
+        _stdDeviations.create(nimages*6 + CV_CALIB_NINTRINSIC, 1, CV_64F);
+        stdDeviationsM = _stdDeviations.getMat();
+    }
+
+    if( errors_needed) {
+        _perViewErrors.create(nimages, 1, CV_64F);
+        errorsM = _perViewErrors.getMat();
+    }
+
+    collectCalibrationData( _objectPoints, _imagePoints, noArray(),
+                            objPt, imgPt, 0, npoints );
+    CvMat c_objPt = objPt, c_imgPt = imgPt, c_npoints = npoints;
+    CvMat c_cameraMatrix = cameraMatrix, c_distCoeffs = distCoeffs;
+    CvMat c_rvecM = rvecM, c_tvecM = tvecM, c_stdDev = stdDeviationsM, c_errors = errorsM;
+
+    double reprojErr = cvfork::cvCalibrateCamera2(&c_objPt, &c_imgPt, &c_npoints, imageSize,
+                                          &c_cameraMatrix, &c_distCoeffs,
+                                          rvecs_needed ? &c_rvecM : NULL,
+                                          tvecs_needed ? &c_tvecM : NULL,
+                                          stddev_needed ? &c_stdDev : NULL,
+                                          errors_needed ? &c_errors : NULL, flags, criteria );
+
+    // overly complicated and inefficient rvec/ tvec handling to support vector<Mat>
+    for(int i = 0; i < nimages; i++ )
+    {
+        if( rvecs_needed && rvecs_mat_vec)
+        {
+            _rvecs.create(3, 1, CV_64F, i, true);
+            Mat rv = _rvecs.getMat(i);
+            memcpy(rv.ptr(), rvecM.ptr(i), 3*sizeof(double));
+        }
+        if( tvecs_needed && tvecs_mat_vec)
+        {
+            _tvecs.create(3, 1, CV_64F, i, true);
+            Mat tv = _tvecs.getMat(i);
+            memcpy(tv.ptr(), tvecM.ptr(i), 3*sizeof(double));
+        }
+    }
+
+    cameraMatrix.copyTo(_cameraMatrix);
+    distCoeffs.copyTo(_distCoeffs);
+
+    return reprojErr;
+}
+
+// Calibrates a camera from detected ChArUco corners.
+//
+// For every view the corner ids are translated into the board's
+// chessboardCorners object points; the resulting lists are passed together
+// with the detected corners to cvfork::calibrateCamera, whose result is
+// returned. All remaining arguments are forwarded unchanged.
+// CV_Assert fires when a view has no ids, when the numbers of ids and corners
+// differ, or when an id lies outside the board's corner list.
+double cvfork::calibrateCameraCharuco(InputArrayOfArrays _charucoCorners, InputArrayOfArrays _charucoIds,
+                              Ptr<aruco::CharucoBoard> &_board, Size imageSize,
+                              InputOutputArray _cameraMatrix, InputOutputArray _distCoeffs,
+                              OutputArrayOfArrays _rvecs, OutputArrayOfArrays _tvecs, OutputArray _stdDeviations, OutputArray _perViewErrors,
+                              int flags, TermCriteria criteria) {
+
+    CV_Assert(_charucoIds.total() > 0 && (_charucoIds.total() == _charucoCorners.total()));
+
+    // One list of board object points per view, in the order of the view's ids.
+    const size_t viewCount = _charucoIds.total();
+    std::vector< std::vector< Point3f > > allObjPoints(viewCount);
+
+    for(unsigned int i = 0; i < viewCount; i++) {
+        const Mat idsOfView = _charucoIds.getMat(i);
+        unsigned int nCorners = (unsigned int)idsOfView.total();
+        CV_Assert(nCorners > 0 && nCorners == _charucoCorners.getMat(i).total()); //actually nCorners must be > 3 for calibration
+
+        std::vector< Point3f >& viewPoints = allObjPoints[i];
+        viewPoints.reserve(nCorners);
+
+        const int* idData = idsOfView.ptr< int >(0);
+        for(unsigned int j = 0; j < nCorners; j++) {
+            int pointId = idData[j];
+            CV_Assert(pointId >= 0 && pointId < (int)_board->chessboardCorners.size());
+            viewPoints.push_back(_board->chessboardCorners[pointId]);
+        }
+    }
+
+    return cvfork::calibrateCamera(allObjPoints, _charucoCorners, imageSize, _cameraMatrix, _distCoeffs,
+                           _rvecs, _tvecs, _stdDeviations, _perViewErrors, flags, criteria);
+}
+
+
+// Copies into dst the entries of src whose column is flagged non-zero in `cols`
+// and whose row is flagged non-zero in `rows`, preserving their relative order.
+// The intermediate and the output are created as CV_64FC1.
+static void subMatrix(const cv::Mat& src, cv::Mat& dst, const std::vector<uchar>& cols,
+                      const std::vector<uchar>& rows) {
+    // Pass 1: drop the unselected columns.
+    const int keptCols = cv::countNonZero(cols);
+    cv::Mat columnsOnly(src.rows, keptCols, CV_64FC1);
+
+    int outCol = 0;
+    for (int c = 0; c < (int)cols.size(); c++)
+    {
+        if (!cols[c])
+            continue;
+        src.col(c).copyTo(columnsOnly.col(outCol));
+        ++outCol;
+    }
+
+    // Pass 2: drop the unselected rows from the column-filtered matrix.
+    const int keptRows = cv::countNonZero(rows);
+    dst.create(keptRows, keptCols, CV_64FC1);
+
+    int outRow = 0;
+    for (int r = 0; r < (int)rows.size(); r++)
+    {
+        if (!rows[r])
+            continue;
+        columnsOnly.row(r).copyTo(dst.row(outRow));
+        ++outRow;
+    }
+}
+
+void cvfork::CvLevMarqFork::step() // one damped update: solve the masked system and write the new `param`
+{
+    using namespace cv;
+    const double LOG10 = log(10.);
+    double lambda = exp(lambdaLg10*LOG10); // lambda = 10^lambdaLg10
+    int nparams = param->rows;
+
+    Mat _JtJ = cvarrToMat(JtJ);
+    Mat _mask = cvarrToMat(mask);
+
+    int nparams_nz = countNonZero(_mask); // number of parameters enabled by the mask
+    if(!JtJN || JtJN->rows != nparams_nz) {
+        // prevent re-allocation in every step
+        JtJN.reset(cvCreateMat( nparams_nz, nparams_nz, CV_64F )); // reduced JtJ
+        JtJV.reset(cvCreateMat( nparams_nz, 1, CV_64F )); // reduced JtErr (right-hand side)
+        JtJW.reset(cvCreateMat( nparams_nz, 1, CV_64F )); // solution for the enabled parameters
+    }
+
+    Mat _JtJN = cvarrToMat(JtJN);
+    Mat _JtErr = cvarrToMat(JtJV);
+    Mat_<double> nonzero_param = cvarrToMat(JtJW);
+
+    subMatrix(cvarrToMat(JtErr), _JtErr, std::vector<uchar>(1, 1), _mask); // keep the single column, only masked-in rows
+    subMatrix(_JtJ, _JtJN, _mask, _mask); // keep only masked-in rows and columns
+
+    if( !err )
+        completeSymm( _JtJN, completeSymmFlag );
+#if 1
+    _JtJN.diag() *= 1. + lambda; // active variant: scale the diagonal by (1 + lambda)
+#else
+    _JtJN.diag() += lambda;
+#endif
+#ifndef USE_LAPACK
+    cv::solve(_JtJN, _JtErr, nonzero_param, solveMethod);
+#else
+    cvfork::solve(_JtJN, _JtErr, nonzero_param, solveMethod);
+#endif
+
+    int j = 0; // index into the reduced solution vector
+    for( int i = 0; i < nparams; i++ ) // masked-out parameters keep their previous value
+        param->data.db[i] = prevParam->data.db[i] - (mask->data.ptr[i] ? nonzero_param(j++) : 0);
+}
+
+// Constructs the solver by forwarding every argument to init().
+cvfork::CvLevMarqFork::CvLevMarqFork(int nparams, int nerrs, CvTermCriteria criteria0, bool _completeSymmFlag)
+{
+    init( nparams, nerrs, criteria0, _completeSymmFlag );
+}
+
+// Calls clear() when the solver is destroyed.
+cvfork::CvLevMarqFork::~CvLevMarqFork()
+{
+    clear();
+}
+
+bool cvfork::CvLevMarqFork::updateAlt( const CvMat*& _param, CvMat*& _JtJ, CvMat*& _JtErr, double*& _errNorm ) // advances the solver state machine; returns false once DONE
+{
+    CV_Assert( !err );
+    if( state == DONE )
+    {
+        _param = param;
+        return false;
+    }
+
+    if( state == STARTED ) // first call: hand out zeroed JtJ / JtErr and the error accumulator
+    {
+        _param = param;
+        cvZero( JtJ );
+        cvZero( JtErr );
+        errNorm = 0;
+        _JtJ = JtJ;
+        _JtErr = JtErr;
+        _errNorm = &errNorm;
+        state = CALC_J;
+        return true;
+    }
+
+    if( state == CALC_J ) // JtJ / JtErr were filled by the caller: take a step, then ask for the new error
+    {
+        cvCopy( param, prevParam );
+        step();
+        _param = param;
+        prevErrNorm = errNorm;
+        errNorm = 0;
+        _errNorm = &errNorm;
+        state = CHECK_ERR;
+        return true;
+    }
+
+    assert( state == CHECK_ERR );
+    if( errNorm > prevErrNorm ) // the step made the error worse
+    {
+        if( ++lambdaLg10 <= 16 ) // increase damping and retry from prevParam while lambdaLg10 <= 16
+        {
+            step();
+            _param = param;
+            errNorm = 0;
+            _errNorm = &errNorm;
+            state = CHECK_ERR;
+            return true;
+        }
+    }
+
+    lambdaLg10 = MAX(lambdaLg10-1, -16); // relax damping, clamped at -16
+    if( ++iters >= criteria.max_iter ||
+        cvNorm(param, prevParam, CV_RELATIVE_L2) < criteria.epsilon ) // stop on iteration limit or small relative parameter change
+    {
+        //printf("iters %i\n", iters);
+        _param = param;
+        state = DONE;
+        return false;
+    }
+
+    prevErrNorm = errNorm;
+    cvZero( JtJ );
+    cvZero( JtErr );
+    _param = param;
+    _JtJ = JtJ;
+    _JtErr = JtErr;
+    state = CALC_J; // next call expects freshly accumulated JtJ / JtErr
+    return true;
+}
--- a/apps/interactive-calibration/cvCalibrationFork.hpp
+++ b/apps/interactive-calibration/cvCalibrationFork.hpp
@ -0,0 +1,56 @@
+#ifndef CV_CALIBRATION_FORK_HPP
+#define CV_CALIBRATION_FORK_HPP
+
+#include <opencv2/core.hpp>
+#include <opencv2/aruco/charuco.hpp>
+#include <opencv2/calib3d.hpp>
+#include <opencv2/calib3d/calib3d_c.h>
+
+namespace cvfork // forked calibration entry points that additionally take stdDeviations / perViewErrors outputs
+{
+using namespace cv;
+
+#define CV_CALIB_NINTRINSIC 18
+#define CALIB_USE_QR (1 << 18) // NOTE(review): equals 0x40000 - verify it does not collide with an existing cv::CALIB_* flag
+
+double calibrateCamera(InputArrayOfArrays objectPoints, // C++ interface; defaults: flags = 0, at most 30 iterations, eps = DBL_EPSILON
+                                     InputArrayOfArrays imagePoints, Size imageSize,
+                                     InputOutputArray cameraMatrix, InputOutputArray distCoeffs,
+                                     OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs, OutputArray stdDeviations,
+                                     OutputArray perViewErrors, int flags = 0, TermCriteria criteria = TermCriteria(
+                                        TermCriteria::COUNT + TermCriteria::EPS, 30, DBL_EPSILON) );
+
+double cvCalibrateCamera2( const CvMat* object_points, // C-style interface; the four output matrices below are optional (default NULL)
+                                const CvMat* image_points,
+                                const CvMat* point_counts,
+                                CvSize image_size,
+                                CvMat* camera_matrix,
+                                CvMat* distortion_coeffs,
+                                CvMat* rotation_vectors CV_DEFAULT(NULL),
+                                CvMat* translation_vectors CV_DEFAULT(NULL),
+                                CvMat* stdDeviations_vector CV_DEFAULT(NULL),
+                                CvMat* perViewErrors_vector CV_DEFAULT(NULL),
+                                int flags CV_DEFAULT(0),
+                                CvTermCriteria term_crit CV_DEFAULT(cvTermCriteria(
+                                    CV_TERMCRIT_ITER+CV_TERMCRIT_EPS,30,DBL_EPSILON)) );
+
+double calibrateCameraCharuco(InputArrayOfArrays _charucoCorners, InputArrayOfArrays _charucoIds, // ChArUco variant: per-frame corners plus their ids
+                              Ptr<aruco::CharucoBoard> &_board, Size imageSize,
+                              InputOutputArray _cameraMatrix, InputOutputArray _distCoeffs,
+                              OutputArrayOfArrays _rvecs, OutputArrayOfArrays _tvecs, OutputArray _stdDeviations, OutputArray _perViewErrors,
+                              int flags = 0, TermCriteria criteria = TermCriteria(
+                                    TermCriteria::COUNT + TermCriteria::EPS, 30, DBL_EPSILON) );
+
+class CvLevMarqFork : public CvLevMarq // CvLevMarq with its own updateAlt() and step()
+{
+public:
+    CvLevMarqFork( int nparams, int nerrs, CvTermCriteria criteria=
+              cvTermCriteria(CV_TERMCRIT_EPS+CV_TERMCRIT_ITER,30,DBL_EPSILON),
+              bool completeSymmFlag=false );
+    bool updateAlt( const CvMat*& _param, CvMat*& _JtJ, CvMat*& _JtErr, double*& _errNorm ); // returns false when the optimisation is finished
+    void step(); // NOTE(review): not declared virtual here - confirm calls through a CvLevMarq* are not expected to reach it
+    ~CvLevMarqFork();
+};
+}
+
+#endif
--- a/apps/interactive-calibration/defaultConfig.xml
+++ b/apps/interactive-calibration/defaultConfig.xml
@ -0,0 +1,14 @@
+<?xml version="1.0"?>
+<opencv_storage>
+<charuco_dict>0</charuco_dict>
+<charuco_square_lenght>200</charuco_square_lenght>
+<charuco_marker_size>100</charuco_marker_size>
+<calibration_step>1</calibration_step>
+<max_frames_num>30</max_frames_num>
+<min_frames_num>10</min_frames_num>
+<solver_eps>1e-7</solver_eps>
+<solver_max_iters>30</solver_max_iters>
+<fast_solver>0</fast_solver>
+<frame_filter_conv_param>0.1</frame_filter_conv_param>
+<camera_resolution>800 600</camera_resolution>
+</opencv_storage>
--- a/apps/interactive-calibration/frameProcessor.cpp
+++ b/apps/interactive-calibration/frameProcessor.cpp
@ -0,0 +1,518 @@
+#include "frameProcessor.hpp"
+#include "rotationConverters.hpp"
+
+#include <opencv2/calib3d.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/aruco/charuco.hpp>
+#include <opencv2/highgui.hpp>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <limits>
+
+using namespace calib;
+
+#define VIDEO_TEXT_SIZE 4
+#define POINT_SIZE 5
+
+// Builds the blob-detector configuration passed to cv::SimpleBlobDetector::create()
+// for the DoubleAcirclesGrid board type.
+static cv::SimpleBlobDetector::Params getDetectorParams()
+{
+    const float noUpperBound = std::numeric_limits<float>::max();
+    cv::SimpleBlobDetector::Params p;
+
+    // Threshold sweep and blob merging.
+    p.minThreshold = 20;
+    p.maxThreshold = 500;
+    p.thresholdStep = 40;
+    p.minRepeatability = 2;
+    p.minDistBetweenBlobs = 5;
+
+    // Colour filter: enabled, blobColor 0.
+    p.filterByColor = true;
+    p.blobColor = 0;
+
+    // Area filter: enabled.
+    p.filterByArea = true;
+    p.minArea = 5;
+    p.maxArea = 5000;
+
+    // Circularity filter: disabled, bounds still set.
+    p.filterByCircularity = false;
+    p.minCircularity = 0.8f;
+    p.maxCircularity = noUpperBound;
+
+    // Inertia filter: enabled.
+    p.filterByInertia = true;
+    p.minInertiaRatio = 0.1f;
+    p.maxInertiaRatio = noUpperBound;
+
+    // Convexity filter: enabled.
+    p.filterByConvexity = true;
+    p.minConvexity = 0.8f;
+    p.maxConvexity = noUpperBound;
+
+    return p;
+}
+
+// Out-of-line definition of the virtual destructor; nothing to release.
+FrameProcessor::~FrameProcessor()
+{
+}
+
+// Looks for the chessboard in `frame`. On success the corners are refined to
+// sub-pixel accuracy, drawn onto `frame`, and the first corner is pushed to the
+// front of mTemplateLocations. Returns whether the board was found.
+bool CalibProcessor::detectAndParseChessboard(const cv::Mat &frame)
+{
+    const int detectionFlags = cv::CALIB_CB_ADAPTIVE_THRESH | cv::CALIB_CB_NORMALIZE_IMAGE | cv::CALIB_CB_FAST_CHECK;
+    if (!cv::findChessboardCorners(frame, mBoardSize, mCurrentImagePoints, detectionFlags))
+        return false;
+
+    cv::Mat gray;
+    cv::cvtColor(frame, gray, cv::COLOR_BGR2GRAY);
+
+    const cv::TermCriteria refineCriteria( cv::TermCriteria::EPS+cv::TermCriteria::COUNT, 30, 0.1 );
+    cv::cornerSubPix(gray, mCurrentImagePoints, cv::Size(11,11), cv::Size(-1,-1), refineCriteria);
+
+    cv::drawChessboardCorners(frame, mBoardSize, cv::Mat(mCurrentImagePoints), true);
+    mTemplateLocations.insert(mTemplateLocations.begin(), mCurrentImagePoints[0]);
+    return true;
+}
+
+// Detects ArUco markers in `frame` and interpolates the ChArUco corners from them.
+// When more than 3 corner elements are found, their centroid is pushed to the
+// front of mTemplateLocations, the corners are drawn onto `frame`, the current
+// corner / id matrices are stored, and true is returned.
+bool CalibProcessor::detectAndParseChAruco(const cv::Mat &frame)
+{
+    cv::Ptr<cv::aruco::Board> genericBoard = mCharucoBoard.staticCast<cv::aruco::Board>();
+
+    std::vector<int> markerIds;
+    std::vector<std::vector<cv::Point2f> > markerCorners, rejectedCandidates;
+    cv::aruco::detectMarkers(frame, mArucoDictionary, markerCorners, markerIds,
+                             cv::aruco::DetectorParameters::create(), rejectedCandidates);
+    cv::aruco::refineDetectedMarkers(frame, genericBoard, markerCorners, markerIds, rejectedCandidates);
+
+    cv::Mat charucoCorners, charucoIds;
+    if(!markerIds.empty()) {
+        cv::aruco::interpolateCornersCharuco(markerCorners, markerIds, frame, mCharucoBoard,
+                                             charucoCorners, charucoIds);
+        cv::aruco::drawDetectedMarkers(frame, markerCorners);
+    }
+
+    if(charucoCorners.total() <= 3)
+        return false;
+
+    // Centroid of the interpolated corners.
+    const int cornerRows = charucoCorners.size[0];
+    float sumX = 0, sumY = 0;
+    for (int r = 0; r < cornerRows; r++) {
+        sumX += charucoCorners.at<float>(r, 0);
+        sumY += charucoCorners.at<float>(r, 1);
+    }
+    sumX /= cornerRows;
+    sumY /= cornerRows;
+
+    mTemplateLocations.insert(mTemplateLocations.begin(), cv::Point2f(sumX, sumY));
+    cv::aruco::drawDetectedCornersCharuco(frame, charucoCorners, charucoIds);
+    mCurrentCharucoCorners = charucoCorners;
+    mCurrentCharucoIds = charucoIds;
+    return true;
+}
+
+// Looks for an asymmetric circles grid in `frame`. On success the first grid
+// point is pushed to the front of mTemplateLocations and the grid is drawn
+// onto `frame`. Returns whether the grid was found.
+bool CalibProcessor::detectAndParseACircles(const cv::Mat &frame)
+{
+    if(!findCirclesGrid(frame, mBoardSize, mCurrentImagePoints, cv::CALIB_CB_ASYMMETRIC_GRID, mBlobDetectorPtr))
+        return false;
+
+    mTemplateLocations.insert(mTemplateLocations.begin(), mCurrentImagePoints[0]);
+    cv::drawChessboardCorners(frame, mBoardSize, cv::Mat(mCurrentImagePoints), true);
+    return true;
+}
+
+// Looks for two asymmetric circles grids: one in `frame` and one in its bitwise
+// inverse. Both must be found. On success both grids are drawn onto `frame`,
+// the points of the second grid are appended to mCurrentImagePoints, and the
+// first point is pushed to the front of mTemplateLocations.
+bool CalibProcessor::detectAndParseDualACircles(const cv::Mat &frame)
+{
+    std::vector<cv::Point2f> invertedGridPoints;
+
+    cv::Mat invertedFrame;
+    cv::bitwise_not(frame, invertedFrame);
+
+    // Grid detected in the frame as given.
+    if(!cv::findCirclesGrid(frame, mBoardSize, mCurrentImagePoints, cv::CALIB_CB_ASYMMETRIC_GRID, mBlobDetectorPtr))
+        return false;
+
+    // Grid detected in the inverted frame.
+    if(!cv::findCirclesGrid(invertedFrame, mBoardSize, invertedGridPoints, cv::CALIB_CB_ASYMMETRIC_GRID, mBlobDetectorPtr))
+    {
+        mCurrentImagePoints.clear();
+        return false;
+    }
+
+    cv::drawChessboardCorners(frame, mBoardSize, cv::Mat(mCurrentImagePoints), true);
+    cv::drawChessboardCorners(frame, mBoardSize, cv::Mat(invertedGridPoints), true);
+    mCurrentImagePoints.insert(mCurrentImagePoints.end(), invertedGridPoints.begin(), invertedGridPoints.end());
+    mTemplateLocations.insert(mTemplateLocations.begin(), mCurrentImagePoints[0]);
+
+    return true;
+}
+
+void CalibProcessor::saveFrameData() // appends the current detection (and, except for chAruco, its generated object points) to mCalibData
+{
+    std::vector<cv::Point3f> objectPoints;
+
+    switch(mBoardType)
+    {
+    case Chessboard: // regular grid in the z = 0 plane, spacing mSquareSize
+        objectPoints.reserve(mBoardSize.height*mBoardSize.width);
+        for( int i = 0; i < mBoardSize.height; ++i )
+            for( int j = 0; j < mBoardSize.width; ++j )
+                objectPoints.push_back(cv::Point3f(j*mSquareSize, i*mSquareSize, 0));
+        mCalibData->imagePoints.push_back(mCurrentImagePoints);
+        mCalibData->objectPoints.push_back(objectPoints);
+        break;
+    case chAruco: // only corners and ids are stored; no object points are generated here
+        mCalibData->allCharucoCorners.push_back(mCurrentCharucoCorners);
+        mCalibData->allCharucoIds.push_back(mCurrentCharucoIds);
+        break;
+    case AcirclesGrid: // odd rows are shifted by one mSquareSize in x
+        objectPoints.reserve(mBoardSize.height*mBoardSize.width);
+        for( int i = 0; i < mBoardSize.height; i++ )
+            for( int j = 0; j < mBoardSize.width; j++ )
+                objectPoints.push_back(cv::Point3f((2*j + i % 2)*mSquareSize, i*mSquareSize, 0));
+        mCalibData->imagePoints.push_back(mCurrentImagePoints);
+        mCalibData->objectPoints.push_back(objectPoints);
+        break;
+    case DoubleAcirclesGrid: // two grids separated by mTemplDist; coordinates are taken relative to (gridCenterX, gridCenterY) and negated
+    {
+        float gridCenterX = (2*((float)mBoardSize.width - 1) + 1)*mSquareSize + mTemplDist / 2;
+        float gridCenterY = (mBoardSize.height - 1)*mSquareSize / 2;
+        objectPoints.reserve(2*mBoardSize.height*mBoardSize.width);
+
+        //white part
+        for( int i = 0; i < mBoardSize.height; i++ )
+            for( int j = 0; j < mBoardSize.width; j++ )
+                objectPoints.push_back(
+                            cv::Point3f(-float((2*j + i % 2)*mSquareSize + mTemplDist +
+                                               (2*(mBoardSize.width - 1) + 1)*mSquareSize - gridCenterX),
+                                        -float(i*mSquareSize) - gridCenterY,
+                                        0));
+        //black part
+        for( int i = 0; i < mBoardSize.height; i++ )
+            for( int j = 0; j < mBoardSize.width; j++ )
+                objectPoints.push_back(cv::Point3f(-float((2*j + i % 2)*mSquareSize - gridCenterX),
+                                          -float(i*mSquareSize) - gridCenterY, 0));
+
+        mCalibData->imagePoints.push_back(mCurrentImagePoints); // image points were stored in the same order (first grid, then second)
+        mCalibData->objectPoints.push_back(objectPoints);
+    }
+        break;
+    }
+}
+
+// Inverts `frame` in place, writes `message` on it, shows it in the main window
+// and waits 300 ms for a key press.
+void CalibProcessor::showCaptureMessage(const cv::Mat& frame, const std::string &message)
+{
+    // Font scale grows with the frame width relative to IMAGE_MAX_WIDTH.
+    const double fontScale = VIDEO_TEXT_SIZE * frame.cols / (double) IMAGE_MAX_WIDTH;
+
+    cv::bitwise_not(frame, frame);
+    cv::putText(frame, message, cv::Point(100, 100), 1, fontScale, cv::Scalar(0,0,255), 2, cv::LINE_AA);
+
+    cv::imshow(mainWindowName, frame);
+    cv::waitKey(300);
+}
+
+// Checks the pose of the most recently saved frame and discards it when the board
+// is tilted too much.
+//
+// The pose is estimated with cv::solvePnP using the current camera matrix, or a
+// placeholder matrix (focal length 20000, principal point at the image centre)
+// when no calibration is available yet. The rotation is converted to Euler
+// angles in degrees; a frame is bad when either of the first two angles exceeds
+// badAngleThresh (for chAruco the first angle is measured as 180 - |angle|).
+//
+// A bad frame is removed from mCalibData again (the entries appended by
+// saveFrameData()). Returns true when the frame was rejected.
+bool CalibProcessor::checkLastFrame()
+{
+    bool isFrameBad = false;
+    cv::Mat tmpCamMatrix;
+    const double badAngleThresh = 40;
+
+    if(!mCalibData->cameraMatrix.total()) {
+        tmpCamMatrix = cv::Mat::eye(3, 3, CV_64F);
+        tmpCamMatrix.at<double>(0,0) = 20000;
+        tmpCamMatrix.at<double>(1,1) = 20000;
+        // Element (0,2) is cx (horizontal) and (1,2) is cy (vertical), so they take
+        // half the width and half the height respectively.
+        tmpCamMatrix.at<double>(0,2) = mCalibData->imageSize.width/2;
+        tmpCamMatrix.at<double>(1,2) = mCalibData->imageSize.height/2;
+    }
+    else
+        mCalibData->cameraMatrix.copyTo(tmpCamMatrix);
+
+    if(mBoardType != chAruco) {
+        cv::Mat r, t, angles;
+        cv::solvePnP(mCalibData->objectPoints.back(), mCurrentImagePoints, tmpCamMatrix, mCalibData->distCoeffs, r, t);
+        RodriguesToEuler(r, angles, CALIB_DEGREES);
+
+        if(fabs(angles.at<double>(0)) > badAngleThresh || fabs(angles.at<double>(1)) > badAngleThresh) {
+            mCalibData->objectPoints.pop_back();
+            mCalibData->imagePoints.pop_back();
+            isFrameBad = true;
+        }
+    }
+    else {
+        cv::Mat r, t, angles;
+        // Rebuild the 3D board points that correspond to the detected corner ids.
+        std::vector<cv::Point3f> allObjPoints;
+        allObjPoints.reserve(mCurrentCharucoIds.total());
+        for(size_t i = 0; i < mCurrentCharucoIds.total(); i++) {
+            int pointID = mCurrentCharucoIds.at<int>((int)i);
+            CV_Assert(pointID >= 0 && pointID < (int)mCharucoBoard->chessboardCorners.size());
+            allObjPoints.push_back(mCharucoBoard->chessboardCorners[pointID]);
+        }
+
+        cv::solvePnP(allObjPoints, mCurrentCharucoCorners, tmpCamMatrix, mCalibData->distCoeffs, r, t);
+        RodriguesToEuler(r, angles, CALIB_DEGREES);
+
+        if(180.0 - fabs(angles.at<double>(0)) > badAngleThresh || fabs(angles.at<double>(1)) > badAngleThresh) {
+            isFrameBad = true;
+            mCalibData->allCharucoCorners.pop_back();
+            mCalibData->allCharucoIds.pop_back();
+        }
+    }
+    return isFrameBad;
+}
+
+// Sets up capture bookkeeping from `capParams` and creates the detector objects
+// required by the selected board type.
+CalibProcessor::CalibProcessor(cv::Ptr<calibrationData> data, captureParameters &capParams) :
+    mCalibData(data), mBoardType(capParams.board), mBoardSize(capParams.boardSize)
+{
+    // Board geometry.
+    mSquareSize = capParams.squareSize;
+    mTemplDist = capParams.templDst;
+
+    // Capture bookkeeping.
+    mCapuredFrames = 0;
+    mNeededFramesNum = capParams.calibrationStep;
+    mDelayBetweenCaptures = static_cast<int>(capParams.captureDelay * capParams.fps);
+
+    // Maximum allowed template movement: 1/20 of the image diagonal.
+    mMaxTemplateOffset = std::sqrt(std::pow(mCalibData->imageSize.height, 2) +
+                                   std::pow(mCalibData->imageSize.width, 2)) / 20.0;
+
+    if(mBoardType == chAruco) {
+        mArucoDictionary = cv::aruco::getPredefinedDictionary(
+                    cv::aruco::PREDEFINED_DICTIONARY_NAME(capParams.charucoDictName));
+        mCharucoBoard = cv::aruco::CharucoBoard::create(mBoardSize.width, mBoardSize.height, capParams.charucoSquareLenght,
+                                                        capParams.charucoMarkerSize, mArucoDictionary);
+    }
+    else if(mBoardType == AcirclesGrid) {
+        mBlobDetectorPtr = cv::SimpleBlobDetector::create();
+    }
+    else if(mBoardType == DoubleAcirclesGrid) {
+        mBlobDetectorPtr = cv::SimpleBlobDetector::create(getDetectorParams());
+    }
+    // Chessboard needs no extra detector objects.
+}
+
+// Runs pattern detection on a copy of `frame` and, when the pattern has stayed
+// nearly still for mDelayBetweenCaptures tracked detections, stores the frame
+// for calibration.
+//
+// A stored frame is immediately validated with checkLastFrame(); the user is
+// told whether it was captured or rejected through showOverlayMessage(), or
+// through showCaptureMessage() when that returns false. Note that
+// showCaptureMessage() inverts the caller's `frame` in place.
+//
+// Returns the copy of the frame with the detection drawn on it.
+cv::Mat CalibProcessor::processFrame(const cv::Mat &frame)
+{
+    cv::Mat frameCopy;
+    frame.copyTo(frameCopy);
+    bool isTemplateFound = false;
+    mCurrentImagePoints.clear();
+
+    switch(mBoardType)
+    {
+    case Chessboard:
+        isTemplateFound = detectAndParseChessboard(frameCopy);
+        break;
+    case chAruco:
+        isTemplateFound = detectAndParseChAruco(frameCopy);
+        break;
+    case AcirclesGrid:
+        isTemplateFound = detectAndParseACircles(frameCopy);
+        break;
+    case DoubleAcirclesGrid:
+        isTemplateFound = detectAndParseDualACircles(frameCopy);
+        break;
+    }
+
+    // Keep at most mDelayBetweenCaptures recent template locations (newest first).
+    if(mTemplateLocations.size() > mDelayBetweenCaptures)
+        mTemplateLocations.pop_back();
+    // The emptiness check protects front()/back() when mDelayBetweenCaptures is 0.
+    if(!mTemplateLocations.empty() && mTemplateLocations.size() == mDelayBetweenCaptures && isTemplateFound) {
+        if(cv::norm(mTemplateLocations.front() - mTemplateLocations.back()) < mMaxTemplateOffset) {
+            saveFrameData();
+            bool isFrameBad = checkLastFrame();
+            if (!isFrameBad) {
+                // The sizes are size_t; convert explicitly because "%d" expects an int.
+                const int capturedTotal = static_cast<int>(std::max(mCalibData->imagePoints.size(),
+                                                                    mCalibData->allCharucoCorners.size()));
+                std::string displayMessage = cv::format("Frame # %d captured", capturedTotal);
+                if(!showOverlayMessage(displayMessage))
+                    showCaptureMessage(frame, displayMessage);
+                mCapuredFrames++;
+            }
+            else {
+                std::string displayMessage = "Frame rejected";
+                if(!showOverlayMessage(displayMessage))
+                    showCaptureMessage(frame, displayMessage);
+            }
+            // Start a fresh stillness window after every capture attempt.
+            mTemplateLocations.clear();
+            mTemplateLocations.reserve(mDelayBetweenCaptures);
+        }
+    }
+
+    return frameCopy;
+}
+
+// True once at least mNeededFramesNum frames have been captured.
+bool CalibProcessor::isProcessed() const
+{
+    return !(mCapuredFrames < mNeededFramesNum);
+}
+
+// Forgets the tracked template locations and restarts the captured-frame count.
+void CalibProcessor::resetState()
+{
+    mTemplateLocations.clear();
+    mCapuredFrames = 0;
+}
+
+// Nothing to release explicitly; members clean up after themselves.
+CalibProcessor::~CalibProcessor()
+{
+}
+
+////////////////////////////////////////////
+
+// Blends a green, filled convex hull of `points` (scaled by mGridViewScale)
+// into `img` with weight 0.2.
+void ShowProcessor::drawBoard(cv::Mat &img, cv::InputArray points)
+{
+    cv::Mat overlay = cv::Mat::zeros(img.rows, img.cols, CV_8UC3);
+
+    std::vector<cv::Point2f> hull;
+    cv::convexHull(points, hull);
+
+    // Scale the hull vertices to the grid-view resolution, truncating to integers.
+    std::vector<cv::Point> scaledHull(hull.size());
+    size_t v = 0;
+    while(v < hull.size()) {
+        scaledHull[v] = cv::Point((int)(hull[v].x*mGridViewScale), (int)(hull[v].y*mGridViewScale));
+        ++v;
+    }
+
+    cv::fillConvexPoly(overlay, scaledHull, cv::Scalar(0, 255, 0), cv::LINE_AA);
+    cv::addWeighted(overlay, .2, img, 1, 0, img);
+}
+
+// Draws a small green circle on `frame` for every stored image point
+// (or every stored ChArUco corner when the board type is chAruco).
+void ShowProcessor::drawGridPoints(const cv::Mat &frame)
+{
+    if(mBoardType == chAruco) {
+        std::vector<cv::Mat>::iterator view = mCalibdata->allCharucoCorners.begin();
+        for(; view != mCalibdata->allCharucoCorners.end(); ++view) {
+            for(int i = 0; i < view->size[0]; i++)
+                cv::circle(frame, cv::Point((int)view->at<float>(i, 0), (int)view->at<float>(i, 1)),
+                           POINT_SIZE, cv::Scalar(0, 255, 0), 1, cv::LINE_AA);
+        }
+    }
+    else {
+        std::vector<std::vector<cv::Point2f> >::iterator view = mCalibdata->imagePoints.begin();
+        for(; view != mCalibdata->imagePoints.end(); ++view) {
+            for(std::vector<cv::Point2f>::iterator pt = view->begin(); pt != view->end(); ++pt)
+                cv::circle(frame, *pt, POINT_SIZE, cv::Scalar(0, 255, 0), 1, cv::LINE_AA);
+        }
+    }
+}
+
+// Starts in Grid visualisation mode with undistortion enabled, a grid-view
+// scale of 0.5 and the default text size.
+ShowProcessor::ShowProcessor(cv::Ptr<calibrationData> data, cv::Ptr<calibController> controller, TemplateType board) :
+    mCalibdata(data), mController(controller), mBoardType(board)
+{
+    mVisMode = Grid;
+    mNeedUndistort = true;
+    mTextSize = VIDEO_TEXT_SIZE;
+    mGridViewScale = 0.5;
+}
+
+cv::Mat ShowProcessor::processFrame(const cv::Mat &frame) // overlays calibration results; returns `frame` itself while cameraMatrix or distCoeffs is empty
+{
+    if(mCalibdata->cameraMatrix.size[0] && mCalibdata->distCoeffs.size[0]) {
+        mTextSize = VIDEO_TEXT_SIZE * (double) frame.cols / IMAGE_MAX_WIDTH; // scale the text with the frame width
+        cv::Scalar textColor = cv::Scalar(0,0,255);
+        cv::Mat frameCopy;
+
+        if (mNeedUndistort && mController->getFramesNumberState()) {
+            if(mVisMode == Grid)
+                drawGridPoints(frame); // drawn on the input frame so the points get remapped along with it
+            cv::remap(frame, frameCopy, mCalibdata->undistMap1, mCalibdata->undistMap2, cv::INTER_LINEAR);
+            int baseLine = 100;
+            cv::Size textSize = cv::getTextSize("Undistorted view", 1, mTextSize, 2, &baseLine);
+            cv::Point textOrigin(baseLine, frame.rows - (int)(2.5*textSize.height));
+            cv::putText(frameCopy, "Undistorted view", textOrigin, 1, mTextSize, textColor, 2, cv::LINE_AA);
+        }
+        else {
+            frame.copyTo(frameCopy);
+            if(mVisMode == Grid)
+                drawGridPoints(frameCopy);
+        }
+        std::string displayMessage;
+        if(mCalibdata->stdDeviations.at<double>(0) == 0) // presumably fx was not estimated separately (fixed aspect ratio) - TODO confirm
+            displayMessage = cv::format("F = %d RMS = %.3f", (int)mCalibdata->cameraMatrix.at<double>(0,0), mCalibdata->totalAvgErr);
+        else
+            displayMessage = cv::format("Fx = %d Fy = %d RMS = %.3f", (int)mCalibdata->cameraMatrix.at<double>(0,0),
+                                            (int)mCalibdata->cameraMatrix.at<double>(1,1), mCalibdata->totalAvgErr);
+        if(mController->getRMSState() && mController->getFramesNumberState())
+            displayMessage.append(" OK");
+
+        int baseLine = 100;
+        cv::Size textSize = cv::getTextSize(displayMessage, 1, mTextSize - 1, 2, &baseLine);
+        cv::Point textOrigin = cv::Point(baseLine, 2*textSize.height); // first status line, near the top
+        cv::putText(frameCopy, displayMessage, textOrigin, 1, mTextSize - 1, textColor, 2, cv::LINE_AA);
+
+        if(mCalibdata->stdDeviations.at<double>(0) == 0)
+            displayMessage = cv::format("DF = %.2f", mCalibdata->stdDeviations.at<double>(1)*sigmaMult);
+        else
+            displayMessage = cv::format("DFx = %.2f DFy = %.2f", mCalibdata->stdDeviations.at<double>(0)*sigmaMult,
+                                                    mCalibdata->stdDeviations.at<double>(1)*sigmaMult);
+        if(mController->getConfidenceIntrervalsState() && mController->getFramesNumberState())
+            displayMessage.append(" OK");
+        cv::putText(frameCopy, displayMessage, cv::Point(baseLine, 4*textSize.height), 1, mTextSize - 1, textColor, 2, cv::LINE_AA); // second status line
+
+        if(mController->getCommonCalibrationState()) {
+            displayMessage = cv::format("Calibration is done");
+            cv::putText(frameCopy, displayMessage, cv::Point(baseLine, 6*textSize.height), 1, mTextSize - 1, textColor, 2, cv::LINE_AA); // third status line
+        }
+        int calibFlags = mController->getNewFlags();
+        displayMessage = "";
+        if(!(calibFlags & cv::CALIB_FIX_ASPECT_RATIO))
+            displayMessage.append(cv::format("AR=%.3f ", mCalibdata->cameraMatrix.at<double>(0,0)/mCalibdata->cameraMatrix.at<double>(1,1)));
+        if(calibFlags & cv::CALIB_ZERO_TANGENT_DIST)
+            displayMessage.append("TD=0 ");
+        displayMessage.append(cv::format("K1=%.2f K2=%.2f K3=%.2f", mCalibdata->distCoeffs.at<double>(0), mCalibdata->distCoeffs.at<double>(1),
+                                         mCalibdata->distCoeffs.at<double>(4))); // reads index 4: assumes at least 5 distortion coefficients - TODO confirm
+        cv::putText(frameCopy, displayMessage, cv::Point(baseLine, frameCopy.rows - (int)(1.5*textSize.height)), // bottom line
+                    1, mTextSize - 1, textColor, 2, cv::LINE_AA);
+        return frameCopy;
+    }
+
+    return frame;
+}
+
+// Always reports false.
+bool ShowProcessor::isProcessed() const
+{
+    const bool finished = false;
+    return finished;
+}
+
+// Intentionally does nothing.
+void ShowProcessor::resetState()
+{
+}
+
+// Selects the visualisation mode; no window is opened or closed here.
+void ShowProcessor::setVisualizationMode(visualisationMode mode)
+{
+    this->mVisMode = mode;
+}
+
+// Toggles between Grid and Window mode. Entering Window mode redraws the boards
+// view; leaving it destroys the grid window.
+void ShowProcessor::switchVisualizationMode()
+{
+    if(mVisMode != Grid) {
+        mVisMode = Grid;
+        cv::destroyWindow(gridWindowName);
+        return;
+    }
+
+    mVisMode = Window;
+    updateBoardsView();
+}
+
+// Shows an empty matrix in the grid window.
+void ShowProcessor::clearBoardsView()
+{
+    cv::imshow( gridWindowName, cv::Mat() );
+}
+
+// In Window mode, renders the hull of every stored board view onto a black
+// canvas (the image size scaled by mGridViewScale) and shows it in the grid
+// window. For DoubleAcirclesGrid each half of a view's points is drawn as a
+// separate hull. Does nothing in other modes.
+void ShowProcessor::updateBoardsView()
+{
+    if(mVisMode != Window)
+        return;
+
+    const cv::Size fullSize = mCalibdata->imageSize;
+    cv::Mat canvas = cv::Mat::zeros((int)(fullSize.height*mGridViewScale), (int)(fullSize.width*mGridViewScale), CV_8UC3);
+
+    if(mBoardType == chAruco) {
+        for(std::vector<cv::Mat>::iterator view = mCalibdata->allCharucoCorners.begin();
+            view != mCalibdata->allCharucoCorners.end(); ++view)
+            drawBoard(canvas, *view);
+    }
+    else {
+        for(std::vector<std::vector<cv::Point2f> >::iterator view = mCalibdata->imagePoints.begin();
+            view != mCalibdata->imagePoints.end(); ++view) {
+            if(mBoardType != DoubleAcirclesGrid) {
+                drawBoard(canvas, *view);
+                continue;
+            }
+
+            // The two grids are stored back to back in the same vector.
+            const size_t half = view->size()/2;
+            std::vector<cv::Point2f> gridPoints(half);
+            std::copy(view->begin(), view->begin() + half, gridPoints.begin());
+            drawBoard(canvas, gridPoints);
+            std::copy(view->begin() + half, view->begin() + 2*half, gridPoints.begin());
+            drawBoard(canvas, gridPoints);
+        }
+    }
+
+    cv::imshow(gridWindowName, canvas);
+}
+
+// Flips the undistortion flag.
+void ShowProcessor::switchUndistort()
+{
+    if(mNeedUndistort)
+        mNeedUndistort = false;
+    else
+        mNeedUndistort = true;
+}
+
+// Sets the undistortion flag to `isEnabled`.
+void ShowProcessor::setUndistort(bool isEnabled)
+{
+    this->mNeedUndistort = isEnabled;
+}
+
+// Nothing to release explicitly; members clean up after themselves.
+ShowProcessor::~ShowProcessor()
+{
+}
--- a/apps/interactive-calibration/frameProcessor.hpp
+++ b/apps/interactive-calibration/frameProcessor.hpp
@ -0,0 +1,95 @@
+#ifndef FRAME_PROCESSOR_HPP
+#define FRAME_PROCESSOR_HPP
+
+#include <opencv2/core.hpp>
+#include <opencv2/aruco/charuco.hpp>
+#include <opencv2/calib3d.hpp>
+#include "calibCommon.hpp"
+#include "calibController.hpp"
+
+namespace calib
+{
+class FrameProcessor // abstract interface for a per-frame processing stage
+{
+protected:
+
+public:
+    virtual ~FrameProcessor();
+    virtual cv::Mat processFrame(const cv::Mat& frame) = 0; // processes one frame and returns the image to pass on
+    virtual bool isProcessed() const = 0; // whether the stage considers its work complete
+    virtual void resetState() = 0; // resets the stage's internal state
+};
+
+class CalibProcessor : public FrameProcessor // detects the calibration pattern in frames and collects accepted detections into calibrationData
+{
+protected:
+    cv::Ptr<calibrationData> mCalibData; // shared storage receiving the captured points
+    TemplateType mBoardType; // which pattern detector to run
+    cv::Size mBoardSize;
+    std::vector<cv::Point2f> mTemplateLocations; // recent pattern reference positions, newest first
+    std::vector<cv::Point2f> mCurrentImagePoints; // points detected in the current frame (non-ChArUco boards)
+    cv::Mat mCurrentCharucoCorners; // ChArUco corners detected in the current frame
+    cv::Mat mCurrentCharucoIds; // ids matching mCurrentCharucoCorners
+
+    cv::Ptr<cv::SimpleBlobDetector> mBlobDetectorPtr; // created for the circles-grid board types only
+    cv::Ptr<cv::aruco::Dictionary> mArucoDictionary; // created for chAruco only
+    cv::Ptr<cv::aruco::CharucoBoard> mCharucoBoard; // created for chAruco only
+
+    int mNeededFramesNum; // captured frames required before isProcessed() returns true
+    unsigned mDelayBetweenCaptures; // number of tracked locations required before a capture is attempted
+    int mCapuredFrames; // accepted frames since the last resetState()
+    double mMaxTemplateOffset; // maximum pattern movement allowed for a capture
+    float mSquareSize;
+    float mTemplDist; // distance between the two grids of DoubleAcirclesGrid
+
+    bool detectAndParseChessboard(const cv::Mat& frame); // the detectAndParse* methods draw the detection onto `frame`
+    bool detectAndParseChAruco(const cv::Mat& frame);
+    bool detectAndParseACircles(const cv::Mat& frame);
+    bool detectAndParseDualACircles(const cv::Mat& frame);
+    void saveFrameData(); // appends the current detection to mCalibData
+    void showCaptureMessage(const cv::Mat &frame, const std::string& message); // note: inverts `frame` in place before showing it
+    bool checkLastFrame(); // returns true (and removes the last saved frame) when the board pose is too oblique
+
+public:
+    CalibProcessor(cv::Ptr<calibrationData> data, captureParameters& capParams);
+    virtual cv::Mat processFrame(const cv::Mat& frame); // returns a copy of `frame` with the detection drawn on it
+    virtual bool isProcessed() const;
+    virtual void resetState();
+    ~CalibProcessor();
+};
+
+enum visualisationMode {Grid, Window}; // Grid: points drawn on the frame; Window: board hulls shown in a separate window
+
+class ShowProcessor : public FrameProcessor // renders calibration results (text overlay, captured points, undistorted view) onto frames
+{
+protected:
+    cv::Ptr<calibrationData> mCalibdata; // calibration results and captured points to display
+    cv::Ptr<calibController> mController; // queried for state flags shown in the overlay
+    TemplateType mBoardType;
+    visualisationMode mVisMode;
+    bool mNeedUndistort; // whether the undistorted view is requested
+    double mGridViewScale; // scale of the separate boards view relative to the image size
+    double mTextSize;
+
+    void drawBoard(cv::Mat& img, cv::InputArray points); // blends the convex hull of `points` into `img`
+    void drawGridPoints(const cv::Mat& frame); // draws all stored points onto `frame`
+public:
+    ShowProcessor(cv::Ptr<calibrationData> data, cv::Ptr<calibController> controller, TemplateType board);
+    virtual cv::Mat processFrame(const cv::Mat& frame);
+    virtual bool isProcessed() const; // always false
+    virtual void resetState(); // no-op
+
+    void setVisualizationMode(visualisationMode mode);
+    void switchVisualizationMode(); // toggles Grid <-> Window
+    void clearBoardsView();
+    void updateBoardsView(); // redraws the boards window; only acts in Window mode
+
+    void switchUndistort(); // toggles mNeedUndistort
+    void setUndistort(bool isEnabled);
+    ~ShowProcessor();
+};
+
+}
+
+
+#endif
--- a/apps/interactive-calibration/linalg.cpp
+++ b/apps/interactive-calibration/linalg.cpp
@ -0,0 +1,491 @@
+#include "linalg.hpp"
+
+#ifdef USE_LAPACK
+
+typedef int    integer;
+#include <lapacke.h>
+
+#include <cassert>
+using namespace cv;
+
+// Solves the linear system / least-squares problem src * dst = src2 with
+// LAPACK routines.
+//
+//   _src     - m x n coefficient matrix, CV_32F or CV_64F.
+//   _src2arg - right-hand side(s), same type as _src; one column per system.
+//   _dst     - output, created as n x nb (nb = number of right-hand sides).
+//   method   - DECOMP_SVD / DECOMP_EIG (routed to ?gelsd), DECOMP_QR (?gels),
+//              DECOMP_LU (?gesv), DECOMP_CHOLESKY (?potrf + ?potrs), optionally
+//              combined with DECOMP_NORMAL. DECOMP_NORMAL only takes effect
+//              when m > n; the system is then replaced by its normal equations
+//              (src^T*src) * dst = src^T*src2.
+//
+// Returns true when the final LAPACK call reports info == 0. On failure the
+// output matrix is filled with zeros and false is returned.
+bool cvfork::solve(InputArray _src, const InputArray _src2arg, OutputArray _dst, int method )
+    {
+        bool result = true;
+        Mat src = _src.getMat(), _src2 = _src2arg.getMat();
+        int type = src.type();
+        bool is_normal = (method & DECOMP_NORMAL) != 0;
+
+        CV_Assert( type == _src2.type() && (type == CV_32F || type == CV_64F) );
+
+        // From here on "method" holds only the decomposition kind.
+        method &= ~DECOMP_NORMAL;
+        // LU and Cholesky need a square system unless normal equations are used.
+        CV_Assert( (method != DECOMP_LU && method != DECOMP_CHOLESKY) ||
+            is_normal || src.rows == src.cols );
+
+        double rcond=-1, s1=0, work1=0, *work=0, *s=0;
+        float frcond=-1, fs1=0, fwork1=0, *fwork=0, *fs=0;
+        // lwork = -1 turns the first LAPACK calls below into workspace-size queries.
+        integer m = src.rows, m_ = m, n = src.cols, mn = std::max(m,n),
+            nm = std::min(m, n), nb = _src2.cols, lwork=-1, liwork=0, iwork1=0,
+            lda = m, ldx = mn, info=0, rank=0, *iwork=0;
+        int elem_size = CV_ELEM_SIZE(type);
+        bool copy_rhs=false;
+        int buf_size=0;
+        AutoBuffer<uchar> buffer;
+        uchar* ptr;
+        char N[] = {'N', '\0'}, L[] = {'L', '\0'};
+
+        Mat src2 = _src2;
+        _dst.create( src.cols, src2.cols, src.type() );
+        Mat dst = _dst.getMat();
+
+        // Normal equations are only used for over-determined systems (m > n);
+        // in that case the effective number of rows becomes n.
+        if( m <= n )
+            is_normal = false;
+        else if( is_normal )
+            m_ = n;
+
+        // Space for the working copy of the coefficient matrix.
+        buf_size += (is_normal ? n*n : m*n)*elem_size;
+
+        // A separate right-hand-side buffer is needed unless the system is
+        // square with a single right-hand side and a continuous destination.
+        if( m_ != n || nb > 1 || !dst.isContinuous() )
+        {
+            copy_rhs = true;
+            if( is_normal )
+                buf_size += n*nb*elem_size;
+            else
+                buf_size += mn*nb*elem_size;
+        }
+
+        // First pass: workspace-size queries (lwork == -1). The data pointers
+        // passed here are placeholders for the query calls.
+        if( method == DECOMP_SVD || method == DECOMP_EIG )
+        {
+            integer nlvl = cvRound(std::log(std::max(std::min(m_,n)/25., 1.))/CV_LOG2) + 1;
+            liwork = std::min(m_,n)*(3*std::max(nlvl,(integer)0) + 11);
+
+            if( type == CV_32F )
+                sgelsd_(&m_, &n, &nb, (float*)src.data, &lda, (float*)dst.data, &ldx,
+                    &fs1, &frcond, &rank, &fwork1, &lwork, &iwork1, &info);
+            else
+                dgelsd_(&m_, &n, &nb, (double*)src.data, &lda, (double*)dst.data, &ldx,
+                    &s1, &rcond, &rank, &work1, &lwork, &iwork1, &info );
+            // Singular values plus the integer workspace.
+            buf_size += nm*elem_size + (liwork + 1)*sizeof(integer);
+        }
+        else if( method == DECOMP_QR )
+        {
+            if( type == CV_32F )
+                sgels_(N, &m_, &n, &nb, (float*)src.data, &lda,
+                    (float*)dst.data, &ldx, &fwork1, &lwork, &info );
+            else
+                dgels_(N, &m_, &n, &nb, (double*)src.data, &lda,
+                    (double*)dst.data, &ldx, &work1, &lwork, &info );
+        }
+        else if( method == DECOMP_LU )
+        {
+            // Pivot indices for ?gesv.
+            buf_size += (n+1)*sizeof(integer);
+        }
+        else if( method == DECOMP_CHOLESKY )
+            ;
+        else
+            CV_Error( Error::StsBadArg, "Unknown method" );
+        // NOTE: plain assert -- this check disappears when NDEBUG is defined.
+        assert(info == 0);
+
+        // Workspace size reported by the query (stays 0 for LU / Cholesky).
+        lwork = cvRound(type == CV_32F ? (double)fwork1 : work1);
+        buf_size += lwork*elem_size;
+        buffer.allocate(buf_size);
+        ptr = (uchar*)buffer;
+
+        // "at" is the working copy of the coefficient matrix placed at the
+        // start of the buffer; it is stored transposed (n x m_).
+        Mat at(n, m_, type, ptr);
+        ptr += n*m_*elem_size;
+
+        if( method == DECOMP_CHOLESKY || method == DECOMP_EIG )
+            src.copyTo(at);
+        else if( !is_normal )
+            transpose(src, at);
+        else
+            mulTransposed(src, at, true);
+
+        // "xt" holds the right-hand sides on input and the solution on output,
+        // stored transposed (one system per row).
+        Mat xt;
+        if( !is_normal )
+        {
+            if( copy_rhs )
+            {
+                Mat temp(nb, mn, type, ptr);
+                ptr += nb*mn*elem_size;
+                Mat bt = temp.colRange(0, m);
+                xt = temp.colRange(0, n);
+                transpose(src2, bt);
+            }
+            else
+            {
+                // Single right-hand side: work directly in the destination.
+                src2.copyTo(dst);
+                xt = Mat(1, n, type, dst.data);
+            }
+        }
+        else
+        {
+            if( copy_rhs )
+            {
+                xt = Mat(nb, n, type, ptr);
+                ptr += nb*n*elem_size;
+            }
+            else
+                xt = Mat(1, n, type, dst.data);
+            // (a'*b)' = b'*a
+            gemm( src2, src, 1, Mat(), 0, xt, GEMM_1_T );
+        }
+
+        // Leading dimensions in elements, derived from the actual row steps.
+        lda = (int)(at.step ? at.step/elem_size : at.cols);
+        ldx = (int)(xt.step ? xt.step/elem_size : (!is_normal && copy_rhs ? mn : n));
+
+        // Second pass: the actual computation. The remaining part of the
+        // buffer is carved into singular values, workspace and integer
+        // workspace as required by each routine.
+        if( method == DECOMP_SVD || method == DECOMP_EIG )
+        {
+            if( type == CV_32F )
+            {
+                fs = (float*)ptr;
+                ptr += nm*elem_size;
+                fwork = (float*)ptr;
+                ptr += lwork*elem_size;
+                iwork = (integer*)alignPtr(ptr, sizeof(integer));
+
+                sgelsd_(&m_, &n, &nb, (float*)at.data, &lda, (float*)xt.data, &ldx,
+                    fs, &frcond, &rank, fwork, &lwork, iwork, &info);
+            }
+            else
+            {
+                s = (double*)ptr;
+                ptr += nm*elem_size;
+                work = (double*)ptr;
+                ptr += lwork*elem_size;
+                iwork = (integer*)alignPtr(ptr, sizeof(integer));
+
+                dgelsd_(&m_, &n, &nb, (double*)at.data, &lda, (double*)xt.data, &ldx,
+                    s, &rcond, &rank, work, &lwork, iwork, &info);
+            }
+        }
+        else if( method == DECOMP_QR )
+        {
+            if( type == CV_32F )
+            {
+                fwork = (float*)ptr;
+                sgels_(N, &m_, &n, &nb, (float*)at.data, &lda,
+                    (float*)xt.data, &ldx, fwork, &lwork, &info);
+            }
+            else
+            {
+                work = (double*)ptr;
+                dgels_(N, &m_, &n, &nb, (double*)at.data, &lda,
+                    (double*)xt.data, &ldx, work, &lwork, &info);
+            }
+        }
+        // LU on normal equations is handled by the Cholesky routines as well.
+        else if( method == DECOMP_CHOLESKY || (method == DECOMP_LU && is_normal) )
+        {
+            if( type == CV_32F )
+            {
+                spotrf_(L, &n, (float*)at.data, &lda, &info);
+                if(info==0)
+                    spotrs_(L, &n, &nb, (float*)at.data, &lda, (float*)xt.data, &ldx, &info);
+            }
+            else
+            {
+                dpotrf_(L, &n, (double*)at.data, &lda, &info);
+                if(info==0)
+                    dpotrs_(L, &n, &nb, (double*)at.data, &lda, (double*)xt.data, &ldx, &info);
+            }
+        }
+        else if( method == DECOMP_LU )
+        {
+            iwork = (integer*)alignPtr(ptr, sizeof(integer));
+            if( type == CV_32F )
+                sgesv_(&n, &nb, (float*)at.data, &lda, iwork, (float*)xt.data, &ldx, &info );
+            else
+                dgesv_(&n, &nb, (double*)at.data, &lda, iwork, (double*)xt.data, &ldx, &info );
+        }
+        else
+            assert(0);
+        result = info == 0;
+
+        // On failure zero the output; otherwise transpose the solution back
+        // unless it was computed in place inside dst.
+        if( !result )
+            dst = Scalar(0);
+        else if( xt.data != dst.data )
+            transpose( xt, dst );
+
+        return result;
+    }
+
+// Computes the singular value decomposition of a CV_32F / CV_64F matrix with
+// the LAPACK routines sgesdd_ / dgesdd_.
+//
+//   _aarr - input m x n matrix.
+//   _w    - output, created as min(m,n) x 1 (singular values).
+//   _u    - optional output, created as m x min(m,n) (m x m with SVD::FULL_UV).
+//   _vt   - optional output, created as min(m,n) x n (n x n with SVD::FULL_UV).
+//   flags - combination of SVD::NO_UV, SVD::FULL_UV and SVD::MODIFY_A.
+//
+// A negative LAPACK info value triggers CV_Assert; a positive one makes the
+// function zero all outputs.
+static void _SVDcompute( const InputArray _aarr, OutputArray _w,
+                         OutputArray _u, OutputArray _vt, int flags = 0)
+{
+    Mat a = _aarr.getMat(), u, vt;
+    integer m = a.rows, n = a.cols, mn = std::max(m, n), nm = std::min(m, n);
+    int type = a.type(), elem_size = (int)a.elemSize();
+    bool compute_uv = _u.needed() || _vt.needed();
+
+    if( flags & SVD::NO_UV )
+    {
+        _u.release();
+        _vt.release();
+        compute_uv = false;
+    }
+
+    if( compute_uv )
+    {
+        _u.create( (int)m, (int)((flags & SVD::FULL_UV) ? m : nm), type );
+        _vt.create( (int)((flags & SVD::FULL_UV) ? n : nm), n, type );
+        u = _u.getMat();
+        vt = _vt.getMat();
+    }
+
+    _w.create(nm, 1, type, -1, true);
+
+    Mat _a = a, w = _w.getMat();
+    CV_Assert( w.isContinuous() );
+    int work_ofs=0, iwork_ofs=0, buf_size = 0;
+    bool temp_a = false;
+    double u1=0, v1=0, work1=0;
+    float uf1=0, vf1=0, workf1=0;
+    // lwork = -1 makes the first ?gesdd call a workspace-size query.
+    integer lda, ldu, ldv, lwork=-1, iwork1=0, info=0;
+    // ?gesdd job mode: 'N' - singular values only, 'S' - thin factors,
+    // 'A' - full factors, 'O' - one factor overwrites the input matrix.
+    char mode[] = {compute_uv ? 'S' : 'N', '\0'};
+
+    if( m != n && compute_uv && (flags & SVD::FULL_UV) )
+        mode[0] = 'A';
+
+    // Unless the caller allows modifying the input, either work on a
+    // temporary copy ('N' / 'A') or let one of the outputs double as the
+    // working matrix ('O').
+    if( !(flags & SVD::MODIFY_A) )
+    {
+        if( mode[0] == 'N' || mode[0] == 'A' )
+            temp_a = true;
+        else if( compute_uv && (a.size() == vt.size() || a.size() == u.size()) && mode[0] == 'S' )
+            mode[0] = 'O';
+    }
+
+    lda = a.cols;
+    ldv = ldu = mn;
+
+    // Workspace query. Dimensions are passed as (n, m) and the vt / u slots are
+    // swapped -- presumably because LAPACK is column-major and sees the
+    // row-major matrix as its transpose; TODO confirm.
+    if( type == CV_32F )
+    {
+        sgesdd_(mode, &n, &m, (float*)a.data, &lda, (float*)w.data,
+                &vf1, &ldv, &uf1, &ldu, &workf1, &lwork, &iwork1, &info );
+        lwork = cvRound(workf1);
+    }
+    else
+    {
+        dgesdd_(mode, &n, &m, (double*)a.data, &lda, (double*)w.data,
+                &v1, &ldv, &u1, &ldu, &work1, &lwork, &iwork1, &info );
+        lwork = cvRound(work1);
+    }
+
+    // NOTE: plain assert -- this check disappears when NDEBUG is defined.
+    assert(info == 0);
+    // Buffer layout: [temporary copy of a] [work] [integer work, 8*min(m,n)].
+    if( temp_a )
+    {
+        buf_size += n*m*elem_size;
+    }
+    work_ofs = buf_size;
+    buf_size += lwork*elem_size;
+    buf_size = alignSize(buf_size, sizeof(integer));
+    iwork_ofs = buf_size;
+    buf_size += 8*nm*sizeof(integer);
+
+    AutoBuffer<uchar> buf(buf_size);
+    uchar* buffer = (uchar*)buf;
+
+    if( temp_a )
+    {
+        _a = Mat(a.rows, a.cols, type, buffer );
+        a.copyTo(_a);
+    }
+
+    // Mode 'O': copy the input into the output whose size matches and let
+    // LAPACK overwrite it.
+    if( !(flags & SVD::MODIFY_A) && !temp_a )
+    {
+        if( compute_uv && a.size() == vt.size() )
+        {
+            a.copyTo(vt);
+            _a = vt;
+        }
+        else if( compute_uv && a.size() == u.size() )
+        {
+            a.copyTo(u);
+            _a = u;
+        }
+    }
+
+    // Leading dimensions in elements, derived from the actual row steps.
+    if( compute_uv )
+    {
+        ldv = (int)(vt.step ? vt.step/elem_size : vt.cols);
+        ldu = (int)(u.step ? u.step/elem_size : u.cols);
+    }
+
+    lda = (int)(_a.step ? _a.step/elem_size : _a.cols);
+    // Actual decomposition; dummy scalars stand in for outputs that were not requested.
+    if( type == CV_32F )
+    {
+        sgesdd_(mode, &n, &m, _a.ptr<float>(), &lda, w.ptr<float>(),
+                vt.data ? vt.ptr<float>() : (float*)&v1, &ldv,
+                u.data ? u.ptr<float>() : (float*)&u1, &ldu,
+                (float*)(buffer + work_ofs), &lwork,
+                (integer*)(buffer + iwork_ofs), &info );
+    }
+    else
+    {
+        dgesdd_(mode, &n, &m, _a.ptr<double>(), &lda, w.ptr<double>(),
+                vt.data ? vt.ptr<double>() : &v1, &ldv,
+                u.data ? u.ptr<double>() : &u1, &ldu,
+                (double*)(buffer + work_ofs), &lwork,
+                (integer*)(buffer + iwork_ofs), &info );
+    }
+    CV_Assert(info >= 0);
+    // info > 0: zero every output instead of returning partial results.
+    if(info != 0)
+    {
+        if( u.data )
+            u = Scalar(0.);
+        if( vt.data )
+            vt = Scalar(0.);
+        w = Scalar(0.);
+    }
+}
+//////////////////////////////////////////////////////////
+// Row-wise scaled accumulation: for every row i in [0, m) performs
+//     y_i[j] += a[i*inca] * x_i[j]   for j in [0, n),
+// where x_i and y_i advance by dx and dy elements per row. A stride of 0
+// makes every row use the same x (or accumulate into the same y).
+template<typename T1, typename T2, typename T3> static void
+MatrAXPY( int m, int n, const T1* x, int dx,
+          const T2* a, int inca, T3* y, int dy )
+{
+    for( int row = 0; row < m; ++row, x += dx, y += dy )
+    {
+        const T2 scale = a[row*inca];
+        int col = 0;
+
+        // Main part: four columns per pass, processed as two pairs. Both
+        // values of a pair are computed before either one is stored.
+        while( col <= n - 4 )
+        {
+            const T3 first0 = (T3)(y[col]   + scale*x[col]);
+            const T3 first1 = (T3)(y[col+1] + scale*x[col+1]);
+            y[col]   = first0;
+            y[col+1] = first1;
+
+            const T3 second0 = (T3)(y[col+2] + scale*x[col+2]);
+            const T3 second1 = (T3)(y[col+3] + scale*x[col+3]);
+            y[col+2] = second0;
+            y[col+3] = second1;
+
+            col += 4;
+        }
+
+        // Tail: the remaining (n % 4) columns.
+        while( col < n )
+        {
+            y[col] = (T3)(y[col] + scale*x[col]);
+            ++col;
+        }
+    }
+}
+// SVD back substitution: computes x = v * inv(w) * uT * b for an m x n
+// decomposition, skipping singular values that do not exceed
+// eps * sum(w). When b is null the identity is used as right-hand side
+// (nb is forced to m), which yields the pseudo-inverse.
+//
+//   w, incw      - singular values and the stride between them.
+//   u, ldu, uT   - left singular vectors, their row stride and layout flag.
+//   v, ldv, vT   - right singular vectors, their row stride and layout flag.
+//   b, ldb, nb   - right-hand sides (may be null), row stride, column count.
+//   x, ldx       - output (n rows, nb columns) and its row stride.
+//   buffer       - scratch space for at least nb doubles.
+template<typename T> static void
+SVBkSb( int m, int n, const T* w, int incw,
+        const T* u, int ldu, int uT,
+        const T* v, int ldv, int vT,
+        const T* b, int ldb, int nb,
+        T* x, int ldx, double* buffer, T eps )
+{
+    // "Next" strides step to the following singular vector, "Elem" strides
+    // walk along the elements of one vector.
+    const int uNext = uT ? ldu : 1, uElem = uT ? 1 : ldu;
+    const int vNext = vT ? ldv : 1, vElem = vT ? 1 : ldv;
+    const int rankMax = std::min(m, n);
+
+    if( !b )
+        nb = m;
+
+    // Start from a zero result; contributions are accumulated below.
+    for( int r = 0; r < n; r++ )
+    {
+        for( int c = 0; c < nb; c++ )
+            x[r*ldx + c] = 0;
+    }
+
+    // Singular values at or below this cutoff are treated as zero.
+    double cutoff = 0;
+    for( int k = 0; k < rankMax; k++ )
+        cutoff += w[k*incw];
+    cutoff *= eps;
+
+    // v * inv(w) * uT * b
+    for( int k = 0; k < rankMax; k++, u += uNext, v += vNext )
+    {
+        double invW = w[k*incw];
+        if( invW <= cutoff )
+            continue;
+        invW = 1/invW;
+
+        if( nb != 1 )
+        {
+            // Several right-hand sides: buffer = (u_k^T * b) / w_k, then
+            // x += v_k * buffer.
+            if( b )
+            {
+                for( int c = 0; c < nb; c++ )
+                    buffer[c] = 0;
+                MatrAXPY( m, nb, b, ldb, u, uElem, buffer, 0 );
+                for( int c = 0; c < nb; c++ )
+                    buffer[c] *= invW;
+            }
+            else
+            {
+                for( int c = 0; c < nb; c++ )
+                    buffer[c] = u[c*uElem]*invW;
+            }
+            MatrAXPY( n, nb, buffer, 0, v, vElem, x, ldx );
+            continue;
+        }
+
+        // Single right-hand side: plain dot product followed by an axpy.
+        double proj = 0;
+        if( b )
+        {
+            for( int e = 0; e < m; e++ )
+                proj += u[e*uElem]*b[e*ldb];
+        }
+        else
+            proj = u[0];
+        proj *= invW;
+
+        for( int e = 0; e < n; e++ )
+            x[e*ldx] = (T)(x[e*ldx] + proj*v[e*vElem]);
+    }
+}
+
+// Back substitution on a previously computed SVD (w, u, vt): writes
+// vt^T * inv(w) * u^T * rhs into _dst, created as vt.cols x nb. An empty
+// rhs stands for the identity, in which case nb equals u.rows.
+// Only CV_32F and CV_64F are supported; rhs (when given) must have the same
+// type as w and u.rows rows.
+static void _backSubst( const InputArray _w, const InputArray _u, const InputArray _vt,
+                     const InputArray _rhs, OutputArray _dst )
+{
+    Mat w = _w.getMat();
+    Mat u = _u.getMat();
+    Mat vt = _vt.getMat();
+    Mat rhs = _rhs.getMat();
+
+    const int type = w.type();
+    const int esz = (int)w.elemSize();
+    const int m = u.rows;
+    const int n = vt.cols;
+    const int nb = rhs.data ? rhs.cols : m;
+
+    // Scratch space used by SVBkSb, one double per right-hand side.
+    AutoBuffer<double> buffer(nb);
+    CV_Assert( u.data && vt.data && w.data );
+
+    CV_Assert( rhs.data == 0 || (rhs.type() == type && rhs.rows == m) );
+
+    _dst.create( n, nb, type );
+    Mat dst = _dst.getMat();
+
+    switch( type )
+    {
+    case CV_32F:
+        SVBkSb(m, n, (float*)w.data, 1, (float*)u.data, (int)(u.step/esz), false,
+               (float*)vt.data, (int)(vt.step/esz), true, (float*)rhs.data, (int)(rhs.step/esz),
+               nb, (float*)dst.data, (int)(dst.step/esz), buffer, 10*FLT_EPSILON );
+        break;
+    case CV_64F:
+        SVBkSb(m, n, (double*)w.data, 1, (double*)u.data, (int)(u.step/esz), false,
+               (double*)vt.data, (int)(vt.step/esz), true, (double*)rhs.data, (int)(rhs.step/esz),
+               nb, (double*)dst.data, (int)(dst.step/esz), buffer, 2*DBL_EPSILON );
+        break;
+    default:
+        CV_Error( Error::StsUnsupportedFormat, "" );
+    }
+}
+///////////////////////////////////////////
+
+// Element accessors for raw row-major buffers: row y, column x of the
+// float / double matrix starting at srcdata (or dstdata) with a row stride of
+// srcstep (or dststep) bytes. The row argument is parenthesized so that
+// expressions such as Sf(i+1, j) address the intended row.
+#define Sf( y, x ) ((float*)(srcdata + (y)*srcstep))[x]
+#define Sd( y, x ) ((double*)(srcdata + (y)*srcstep))[x]
+#define Df( y, x ) ((float*)(dstdata + (y)*dststep))[x]
+#define Dd( y, x ) ((double*)(dstdata + (y)*dststep))[x]
+
+// Computes the (pseudo-)inverse of a CV_32F / CV_64F matrix through SVD.
+//
+//   _src   - input m x n matrix.
+//   _dst   - output pseudo-inverse (written only for DECOMP_SVD).
+//   method - only DECOMP_SVD is implemented in this fork.
+//
+// Returns the ratio of the smallest to the largest singular value, or 0 when
+// the largest singular value is below the machine epsilon of the element
+// type. For any method other than DECOMP_SVD the function returns 0 and
+// leaves _dst untouched.
+double cvfork::invert( InputArray _src, OutputArray _dst, int method )
+{
+    Mat src = _src.getMat();
+    int type = src.type();
+
+    CV_Assert(type == CV_32F || type == CV_64F);
+
+    size_t esz = CV_ELEM_SIZE(type);
+    int m = src.rows, n = src.cols;
+
+    if( method == DECOMP_SVD )
+    {
+        int nm = std::min(m, n);
+
+        // One allocation holding u (m x nm), w (nm x 1) and vt (nm x n)
+        // back to back; the extra sizeof(double) bytes leave room for the
+        // alignment adjustment below.
+        AutoBuffer<uchar> _buf((m*nm + nm + nm*n)*esz + sizeof(double));
+        uchar* buf = alignPtr((uchar*)_buf, (int)esz);
+        Mat u(m, nm, type, buf);
+        Mat w(nm, 1, type, u.ptr() + m*nm*esz);
+        Mat vt(nm, n, type, w.ptr() + nm*esz);
+
+        _SVDcompute(src, w, u, vt);
+        _backSubst(w, u, vt, Mat(), _dst);
+
+        // w holds nm = min(m, n) singular values, so the smallest one lives
+        // at index nm-1. Indexing with n-1 would read past the end of w for
+        // matrices with more columns than rows.
+        return type == CV_32F ?
+            (w.ptr<float>()[0] >= FLT_EPSILON ?
+             w.ptr<float>()[nm-1]/w.ptr<float>()[0] : 0) :
+            (w.ptr<double>()[0] >= DBL_EPSILON ?
+             w.ptr<double>()[nm-1]/w.ptr<double>()[0] : 0);
+    }
+    return 0;
+}
+
+#endif //USE_LAPACK
--- a/apps/interactive-calibration/linalg.hpp
+++ b/apps/interactive-calibration/linalg.hpp
@ -0,0 +1,13 @@
+#ifndef LINALG_HPP
+#define LINALG_HPP
+
+#include <opencv2/core.hpp>
+
+// LAPACK-backed replacements for cv::invert / cv::solve. Both functions are
+// defined in linalg.cpp only when USE_LAPACK is defined and accept CV_32F or
+// CV_64F matrices.
+namespace cvfork {
+
+// Pseudo-inverse of _src through SVD. Only DECOMP_SVD is implemented; for any
+// other method the function returns 0 without writing _dst. For DECOMP_SVD the
+// return value is a ratio of singular values (0 when the largest one is below
+// the machine epsilon of the element type).
+double invert( cv::InputArray _src, cv::OutputArray _dst, int method );
+// Solves _src * _dst = _src2arg with the decomposition selected by method
+// (optionally combined with DECOMP_NORMAL). Returns true on success; on
+// failure _dst is filled with zeros.
+bool solve(cv::InputArray _src, cv::InputArray _src2arg, cv::OutputArray _dst, int method );
+
+}
+
+#endif
--- a/apps/interactive-calibration/main.cpp
+++ b/apps/interactive-calibration/main.cpp
@ -0,0 +1,210 @@
+#include <opencv2/core.hpp>
+#include <opencv2/calib3d.hpp>
+#include <opencv2/aruco/charuco.hpp>
+#include <opencv2/cvconfig.h>
+#include <opencv2/highgui.hpp>
+#include <string>
+#include <vector>
+#include <stdexcept>
+#include <algorithm>
+#include <iostream>
+
+#include "calibCommon.hpp"
+#include "calibPipeline.hpp"
+#include "frameProcessor.hpp"
+#include "cvCalibrationFork.hpp"
+#include "calibController.hpp"
+#include "parametersController.hpp"
+#include "rotationConverters.hpp"
+
+using namespace calib;
+
+// Command line description consumed by cv::CommandLineParser:
+// "{name | default value | help text}" per option.
+// The template names listed for "t" are spelled the way the parameter parser
+// matches them (lowercase prefixes).
+const std::string keys  =
+        "{v        |         | Input from video file }"
+        "{ci       | 0       | Default camera id }"
+        "{flip     | false   | Vertical flip of input frames }"
+        "{t        | circles | Template for calibration (circles, chessboard, dualcircles, charuco) }"
+        "{sz       | 16.3    | Distance between two nearest centers of circles or squares on calibration board}"
+        "{dst      | 295     | Distance between white and black parts of dualCircles template}"
+        "{w        |         | Width of template (in corners or circles)}"
+        "{h        |         | Height of template (in corners or circles)}"
+        "{of       | cameraParameters.xml | Output file name}"
+        "{ft       | true    | Auto tuning of calibration flags}"
+        "{vis      | grid    | Captured boards visualisation (grid, window)}"
+        "{d        | 0.8     | Min delay between captures}"
+        "{pf       | defaultConfig.xml| Advanced application parameters}"
+        "{help     |         | Print help}";
+
+// Shows a short status message to the user. With Qt support the message is
+// displayed as an overlay on the main window and true is returned; without
+// Qt the message is printed to stdout and false is returned.
+bool calib::showOverlayMessage(const std::string& message)
+{
+#ifndef HAVE_QT
+    // No overlay support available: fall back to the console.
+    std::cout << message << std::endl;
+    return false;
+#else
+    cv::displayOverlay(mainWindowName, message, OVERLAY_DELAY);
+    return true;
+#endif
+}
+
+// Button / key callback: removes the most recently captured frame.
+// "data" must point to a cv::Ptr<calibDataController>; "state" is unused.
+static void deleteButton(int state, void* data)
+{
+    (void)state; // unused, referenced only to silence compiler warnings
+    cv::Ptr<calibDataController>* dataControllerPtr =
+            static_cast<cv::Ptr<calibDataController>*>(data);
+    dataControllerPtr->get()->deleteLastFrame();
+    calib::showOverlayMessage("Last frame deleted");
+}
+
+// Button / key callback: discards all captured calibration data.
+// "data" must point to a cv::Ptr<calibDataController>; "state" is unused.
+static void deleteAllButton(int state, void* data)
+{
+    (void)state; // unused, referenced only to silence compiler warnings
+    cv::Ptr<calibDataController>* dataControllerPtr =
+            static_cast<cv::Ptr<calibDataController>*>(data);
+    dataControllerPtr->get()->deleteAllData();
+    calib::showOverlayMessage("All frames deleted");
+}
+
+// Button / key callback: saves the current camera parameters and reports
+// success to the user. Nothing is shown when saving fails.
+// "data" must point to a cv::Ptr<calibDataController>; "state" is unused.
+static void saveCurrentParamsButton(int state, void* data)
+{
+    (void)state; // unused, referenced only to silence compiler warnings
+    cv::Ptr<calibDataController>* dataControllerPtr =
+            static_cast<cv::Ptr<calibDataController>*>(data);
+    const bool saved = dataControllerPtr->get()->saveCurrentCameraParameters();
+    if(saved)
+        calib::showOverlayMessage("Calibration parameters saved");
+}
+
+#ifdef HAVE_QT
+// Qt button callback: toggles the visualisation mode of the show processor.
+// "data" must point to a cv::Ptr<FrameProcessor> that actually holds a
+// ShowProcessor; "state" is unused.
+static void switchVisualizationModeButton(int state, void* data)
+{
+    (void)state; // unused, referenced only to silence compiler warnings
+    cv::Ptr<FrameProcessor>* processorPtr = static_cast<cv::Ptr<FrameProcessor>*>(data);
+    ShowProcessor* showView = static_cast<ShowProcessor*>(processorPtr->get());
+    showView->switchVisualizationMode();
+}
+
+// Qt checkbox callback: enables or disables undistortion according to the
+// checkbox state and reports the new setting to the user.
+// "data" must point to a cv::Ptr<FrameProcessor> that actually holds a
+// ShowProcessor.
+static void undistortButton(int state, void* data)
+{
+    const bool enabled = (state != 0);
+    cv::Ptr<FrameProcessor>* processorPtr = static_cast<cv::Ptr<FrameProcessor>*>(data);
+    ShowProcessor* showView = static_cast<ShowProcessor*>(processorPtr->get());
+    showView->setUndistort(enabled);
+
+    const std::string stateName = enabled ? std::string("on") : std::string("off");
+    calib::showOverlayMessage(std::string("Undistort is ") + stateName);
+}
+#endif //HAVE_QT
+
+// Application entry point: parses the command line, builds the capture and
+// visualisation processors, then repeatedly runs the capture pipeline and
+// reacts to the reason it stopped (calibrate, delete frames, save, switch
+// view, ...) until the pipeline reports that it has finished.
+// Always returns 0, including when the parameters are rejected.
+int main(int argc, char** argv)
+{
+    cv::CommandLineParser parser(argc, argv, keys);
+    if(parser.has("help")) {
+        parser.printMessage();
+        return 0;
+    }
+    std::cout << consoleHelp << std::endl;
+    parametersController paramsController;
+
+    if(!paramsController.loadFromParser(parser))
+        return 0;
+
+    captureParameters capParams = paramsController.getCaptureParameters();
+    internalParameters intParams = paramsController.getInternalParameters();
+
+    cv::TermCriteria solverTermCrit = cv::TermCriteria(cv::TermCriteria::COUNT+cv::TermCriteria::EPS,
+                                                       intParams.solverMaxIters, intParams.solverEps);
+    cv::Ptr<calibrationData> globalData(new calibrationData);
+    // For camera input the resolution comes from the configuration; for video
+    // files it is taken from the pipeline once calibration starts.
+    if(!parser.has("v")) globalData->imageSize = capParams.cameraResolution;
+
+    int calibrationFlags = 0;
+    if(intParams.fastSolving) calibrationFlags |= CALIB_USE_QR;
+    cv::Ptr<calibController> controller(new calibController(globalData, calibrationFlags,
+                                                         parser.get<bool>("ft"), capParams.minFramesNum));
+    cv::Ptr<calibDataController> dataController(new calibDataController(globalData, capParams.maxFramesNum,
+                                                                     intParams.filterAlpha));
+    dataController->setParametersFileName(parser.get<std::string>("of"));
+
+    cv::Ptr<FrameProcessor> capProcessor, showProcessor;
+    capProcessor = cv::Ptr<FrameProcessor>(new CalibProcessor(globalData, capParams));
+    showProcessor = cv::Ptr<FrameProcessor>(new ShowProcessor(globalData, controller, capParams.board));
+    // showProcessor always holds a ShowProcessor (created just above) and
+    // outlives every use of this pointer, so the downcast is done once.
+    ShowProcessor* const showView = static_cast<ShowProcessor*>(showProcessor.get());
+
+    if(parser.get<std::string>("vis").find("window") == 0) {
+        showView->setVisualizationMode(Window);
+        cv::namedWindow(gridWindowName);
+        cv::moveWindow(gridWindowName, 1280, 500);
+    }
+
+    cv::Ptr<CalibPipeline> pipeline(new CalibPipeline(capParams));
+    std::vector<cv::Ptr<FrameProcessor> > processors;
+    processors.push_back(capProcessor);
+    processors.push_back(showProcessor);
+
+    cv::namedWindow(mainWindowName);
+    cv::moveWindow(mainWindowName, 10, 10);
+#ifdef HAVE_QT
+    // The callbacks receive pointers to the cv::Ptr objects living in this
+    // function's frame.
+    cv::createButton("Delete last frame", deleteButton, &dataController, cv::QT_PUSH_BUTTON);
+    cv::createButton("Delete all frames", deleteAllButton, &dataController, cv::QT_PUSH_BUTTON);
+    cv::createButton("Undistort", undistortButton, &showProcessor, cv::QT_CHECKBOX, false);
+    cv::createButton("Save current parameters", saveCurrentParamsButton, &dataController, cv::QT_PUSH_BUTTON);
+    cv::createButton("Switch visualisation mode", switchVisualizationModeButton, &showProcessor, cv::QT_PUSH_BUTTON);
+#endif //HAVE_QT
+    try {
+        bool pipelineFinished = false;
+        while(!pipelineFinished)
+        {
+            // Runs until the pipeline needs the main loop to take an action.
+            PipelineExitStatus exitStatus = pipeline->start(processors);
+            if (exitStatus == Finished) {
+                if(controller->getCommonCalibrationState())
+                    saveCurrentParamsButton(0, &dataController);
+                pipelineFinished = true;
+                continue;
+            }
+            else if (exitStatus == Calibrate) {
+
+                dataController->rememberCurrentParameters();
+                globalData->imageSize = pipeline->getImageSize();
+                calibrationFlags = controller->getNewFlags();
+
+                if(capParams.board != chAruco) {
+                    globalData->totalAvgErr =
+                            cvfork::calibrateCamera(globalData->objectPoints, globalData->imagePoints,
+                                                    globalData->imageSize, globalData->cameraMatrix,
+                                                    globalData->distCoeffs, cv::noArray(), cv::noArray(),
+                                                    globalData->stdDeviations, globalData->perViewErrors,
+                                                    calibrationFlags, solverTermCrit);
+                }
+                else {
+                    cv::Ptr<cv::aruco::Dictionary> dictionary =
+                            cv::aruco::getPredefinedDictionary(cv::aruco::PREDEFINED_DICTIONARY_NAME(capParams.charucoDictName));
+                    cv::Ptr<cv::aruco::CharucoBoard> charucoboard =
+                                cv::aruco::CharucoBoard::create(capParams.boardSize.width, capParams.boardSize.height,
+                                                                capParams.charucoSquareLenght, capParams.charucoMarkerSize, dictionary);
+                    globalData->totalAvgErr =
+                            cvfork::calibrateCameraCharuco(globalData->allCharucoCorners, globalData->allCharucoIds,
+                                                           charucoboard, globalData->imageSize,
+                                                           globalData->cameraMatrix, globalData->distCoeffs,
+                                                           cv::noArray(), cv::noArray(), globalData->stdDeviations,
+                                                           globalData->perViewErrors, calibrationFlags, solverTermCrit);
+                }
+                dataController->updateUndistortMap();
+                dataController->printParametersToConsole(std::cout);
+                controller->updateState();
+                for(int j = 0; j < capParams.calibrationStep; j++)
+                    dataController->filterFrames();
+                showView->updateBoardsView();
+            }
+            else if (exitStatus == DeleteLastFrame) {
+                deleteButton(0, &dataController);
+                showView->updateBoardsView();
+            }
+            else if (exitStatus == DeleteAllFrames) {
+                deleteAllButton(0, &dataController);
+                showView->updateBoardsView();
+            }
+            else if (exitStatus == SaveCurrentData) {
+                saveCurrentParamsButton(0, &dataController);
+            }
+            else if (exitStatus == SwitchUndistort)
+                showView->switchUndistort();
+            else if (exitStatus == SwitchVisualisation)
+                showView->switchVisualizationMode();
+
+            // Every processor starts the next pipeline run from a clean state.
+            for (std::vector<cv::Ptr<FrameProcessor> >::iterator it = processors.begin(); it != processors.end(); ++it)
+                (*it)->resetState();
+        }
+    }
+    // Caught by const reference: avoids copying and slicing the exception.
+    catch (const std::runtime_error& exp) {
+        std::cout << exp.what() << std::endl;
+    }
+
+    return 0;
+}
--- a/apps/interactive-calibration/parametersController.cpp
+++ b/apps/interactive-calibration/parametersController.cpp
@ -0,0 +1,138 @@
+#include "parametersController.hpp"
+#include <iostream>
+
+// Reads "value" from a file storage node when the node exists.
+// Returns true when the node was present and has been read, false when the
+// node is missing (in which case "value" is left untouched).
+template <typename T>
+static bool readFromNode(cv::FileNode node, T& value)
+{
+    if(node.isNone())
+        return false;
+
+    node >> value;
+    return true;
+}
+
+// Soft assertion: passes "value" through and, when it is false, reports
+// "msg" on stderr prefixed with "Error: ".
+static bool checkAssertion(bool value, const std::string& msg)
+{
+    if(value)
+        return true;
+
+    std::cerr << "Error: " << msg << std::endl;
+    return false;
+}
+
+// Loads the advanced parameters from an OpenCV file storage document and
+// validates them.
+// A file that cannot be opened is not an error: a warning is printed and the
+// current values are kept (returns true). Otherwise every key that is present
+// overrides the corresponding field, and the function returns whether all
+// values pass validation; only the first violated constraint is reported.
+bool calib::parametersController::loadFromFile(const std::string &inputFileName)
+{
+    cv::FileStorage reader(inputFileName, cv::FileStorage::READ);
+
+    if(!reader.isOpened()) {
+        std::cerr << "Warning: Unable to open " << inputFileName <<
+                     " Applicatioin stated with default advanced parameters" << std::endl;
+        return true;
+    }
+
+    // Capture related settings.
+    readFromNode(reader["charuco_dict"], mCapParams.charucoDictName);
+    readFromNode(reader["charuco_square_lenght"], mCapParams.charucoSquareLenght);
+    readFromNode(reader["charuco_marker_size"], mCapParams.charucoMarkerSize);
+    readFromNode(reader["camera_resolution"], mCapParams.cameraResolution);
+    readFromNode(reader["calibration_step"], mCapParams.calibrationStep);
+    readFromNode(reader["max_frames_num"], mCapParams.maxFramesNum);
+    readFromNode(reader["min_frames_num"], mCapParams.minFramesNum);
+    // Solver / filtering settings.
+    readFromNode(reader["solver_eps"], mInternalParameters.solverEps);
+    readFromNode(reader["solver_max_iters"], mInternalParameters.solverMaxIters);
+    readFromNode(reader["fast_solver"], mInternalParameters.fastSolving);
+    readFromNode(reader["frame_filter_conv_param"], mInternalParameters.filterAlpha);
+
+    // Each check runs only while all previous ones have passed.
+    bool retValue = checkAssertion(mCapParams.charucoDictName >= 0, "Dict name must be >= 0");
+    retValue = retValue &&
+            checkAssertion(mCapParams.charucoMarkerSize > 0, "Marker size must be positive");
+    retValue = retValue &&
+            checkAssertion(mCapParams.charucoSquareLenght > 0, "Square size must be positive");
+    retValue = retValue &&
+            checkAssertion(mCapParams.minFramesNum > 1, "Minimal number of frames for calibration < 1");
+    retValue = retValue &&
+            checkAssertion(mCapParams.calibrationStep > 0, "Calibration step must be positive");
+    retValue = retValue &&
+            checkAssertion(mCapParams.maxFramesNum > mCapParams.minFramesNum, "maxFramesNum < minFramesNum");
+    retValue = retValue &&
+            checkAssertion(mInternalParameters.solverEps > 0, "Solver precision must be positive");
+    retValue = retValue &&
+            checkAssertion(mInternalParameters.solverMaxIters > 0, "Max solver iterations number must be positive");
+    retValue = retValue &&
+            checkAssertion(mInternalParameters.filterAlpha >=0 && mInternalParameters.filterAlpha <=1 ,
+                           "Frame filter convolution parameter must be in [0,1] interval");
+    retValue = retValue &&
+            checkAssertion(mCapParams.cameraResolution.width > 0 && mCapParams.cameraResolution.height > 0,
+                           "Wrong camera resolution values");
+
+    reader.release();
+    return retValue;
+}
+
+// Default constructor: performs no explicit initialisation; the parameter
+// members are default-constructed.
+calib::parametersController::parametersController()
+{
+}
+
+// Returns a copy of the current capture parameters.
+calib::captureParameters calib::parametersController::getCaptureParameters() const
+{
+    return mCapParams;
+}
+
+// Returns a copy of the current internal (solver / filter) parameters.
+calib::internalParameters calib::parametersController::getInternalParameters() const
+{
+    return mInternalParameters;
+}
+
+// Fills the capture parameters from the command line and then loads the
+// advanced parameters from the file named by the "pf" option.
+//
+// Returns false (after printing a message to stderr) when a command line
+// value is invalid: non-positive distances, unknown template name,
+// non-positive board size or an output file name that is not "[name].xml".
+// Returns true otherwise.
+bool calib::parametersController::loadFromParser(cv::CommandLineParser &parser)
+{
+    mCapParams.flipVertical = parser.get<bool>("flip");
+    mCapParams.captureDelay = parser.get<float>("d");
+    mCapParams.squareSize = parser.get<float>("sz");
+    mCapParams.templDst = parser.get<float>("dst");
+
+    if(!checkAssertion(mCapParams.squareSize > 0, "Distance between corners or circles must be positive"))
+        return false;
+    if(!checkAssertion(mCapParams.templDst > 0, "Distance betwen parts of dual template must be positive"))
+        return false;
+
+    // A video file takes precedence over the camera id.
+    if (parser.has("v")) {
+        mCapParams.source = File;
+        mCapParams.videoFileName = parser.get<std::string>("v");
+    }
+    else {
+        mCapParams.source = Camera;
+        mCapParams.camID = parser.get<int>("ci");
+    }
+
+    const std::string templateType = parser.get<std::string>("t");
+
+    // Template names are matched as prefixes. The mixed-case spellings shown
+    // in the command line help ("dualCircles", "chAruco") are accepted in
+    // addition to the lowercase ones.
+    if(templateType.find("circles", 0) == 0) {
+        mCapParams.board = AcirclesGrid;
+        mCapParams.boardSize = cv::Size(4, 11);
+    }
+    else if(templateType.find("chessboard", 0) == 0) {
+        mCapParams.board = Chessboard;
+        mCapParams.boardSize = cv::Size(7, 7);
+    }
+    else if(templateType.find("dualcircles", 0) == 0 || templateType.find("dualCircles", 0) == 0) {
+        mCapParams.board = DoubleAcirclesGrid;
+        mCapParams.boardSize = cv::Size(4, 11);
+    }
+    else if(templateType.find("charuco", 0) == 0 || templateType.find("chAruco", 0) == 0) {
+        mCapParams.board = chAruco;
+        mCapParams.boardSize = cv::Size(6, 8);
+        mCapParams.charucoDictName = 0;
+        mCapParams.charucoSquareLenght = 200;
+        mCapParams.charucoMarkerSize = 100;
+    }
+    else {
+        std::cerr << "Wrong template name\n";
+        return false;
+    }
+
+    // An explicit board size overrides the template default; both dimensions
+    // have to be positive.
+    if(parser.has("w") && parser.has("h")) {
+        mCapParams.boardSize = cv::Size(parser.get<int>("w"), parser.get<int>("h"));
+        if(!checkAssertion(mCapParams.boardSize.width > 0 && mCapParams.boardSize.height > 0,
+                           "Board size must be positive"))
+            return false;
+    }
+
+    // The output name must contain ".xml" preceded by a non-empty name.
+    // std::string::find returns npos (a huge value) when nothing is found, so
+    // the position has to be compared against npos explicitly.
+    const std::string outputFileName = parser.get<std::string>("of");
+    const std::string::size_type extensionPos = outputFileName.find(".xml");
+    if(!checkAssertion(extensionPos != std::string::npos && extensionPos > 0,
+                       "Wrong output file name: correct format is [name].xml"))
+        return false;
+
+    // NOTE(review): the validation result of loadFromFile() is ignored here,
+    // so invalid values in the parameters file are reported on stderr but do
+    // not stop the application -- confirm whether that is intended.
+    loadFromFile(parser.get<std::string>("pf"));
+    return true;
+}
--- a/apps/interactive-calibration/parametersController.hpp
+++ b/apps/interactive-calibration/parametersController.hpp
@ -0,0 +1,29 @@
+#ifndef PARAMETERS_CONTROLLER_HPP
+#define PARAMETERS_CONTROLLER_HPP
+
+#include <string>
+#include <opencv2/core.hpp>
+#include "calibCommon.hpp"
+
+namespace calib {
+
+// Holds the settings of the interactive calibration tool and fills them from the
+// command line (loadFromParser) and from a parameters file (loadFromFile).
+class parametersController
+{
+protected:
+    // Capture settings; loadFromParser() writes source, board type, board size etc. here.
+    captureParameters mCapParams;
+    // NOTE(review): presumably populated by loadFromFile() - confirm in the .cpp.
+    internalParameters mInternalParameters;
+
+    // Loads settings from the file named inputFileName.
+    // NOTE(review): the meaning of the bool result is not visible here; loadFromParser()
+    // currently ignores it.
+    bool loadFromFile(const std::string& inputFileName);
+public:
+    parametersController();
+    // NOTE(review): presumably initialises the capture settings from params - confirm in the .cpp.
+    parametersController(cv::Ptr<captureParameters> params);
+
+    // Accessors; both return their result by value.
+    captureParameters getCaptureParameters() const;
+    internalParameters getInternalParameters() const;
+
+    // Validates the command-line options and stores them in mCapParams, then calls
+    // loadFromFile() with the value of the "pf" option.
+    // Returns false as soon as one of the checks fails, true otherwise.
+    bool loadFromParser(cv::CommandLineParser& parser);
+};
+
+}
+
+#endif
--- a/apps/interactive-calibration/rotationConverters.cpp
+++ b/apps/interactive-calibration/rotationConverters.cpp
@ -0,0 +1,121 @@
+#include "rotationConverters.hpp"
+
+#include <cmath>
+#include <opencv2/calib3d.hpp>
+#include <opencv2/core.hpp>
+
+#define CALIB_PI 3.14159265358979323846
+#define CALIB_PI_2 1.57079632679489661923
+
+// Converts between a 3x3 rotation matrix and a vector of 3 angles (pitch, yaw, roll).
+//   src     - 3x3 rotation matrix, or a 3x1 / 1x3 vector of angles; must be CV_64F
+//   dst     - 3x1 CV_64F vector of angles for a matrix input,
+//             3x3 CV_64F rotation matrix for a vector input
+//   argType - CALIB_RADIANS or CALIB_DEGREES: unit of the angles read or written
+// Raises cv::Error::StsBadFlag for any other shape of src or value of argType, and a
+// CV_Assert failure when src is not CV_64F. dst is left untouched when an error is raised.
+void calib::Euler(const cv::Mat& src, cv::Mat& dst, int argType)
+{
+    if((src.rows == 3) && (src.cols == 3))
+    {
+        //convert rotation matrix to 3 angles (pitch, yaw, roll)
+        // Elements are read with at<double>(), which is only valid for CV_64F data.
+        CV_Assert(src.type() == CV_64F);
+        double pitch, yaw, roll;
+
+        // An element (0,2) close to -1 or +1 means yaw is close to -90 or +90 degrees;
+        // pitch and roll cannot be separated there, so roll is fixed to 0.
+        if(src.at<double>(0,2) < -0.998)
+        {
+            pitch = -atan2(src.at<double>(1,0), src.at<double>(1,1));
+            yaw = -CALIB_PI_2;
+            roll = 0.;
+        }
+        else if(src.at<double>(0,2) > 0.998)
+        {
+            pitch = atan2(src.at<double>(1,0), src.at<double>(1,1));
+            yaw = CALIB_PI_2;
+            roll = 0.;
+        }
+        else
+        {
+            pitch = atan2(-src.at<double>(1,2), src.at<double>(2,2));
+            yaw = asin(src.at<double>(0,2));
+            roll = atan2(-src.at<double>(0,1), src.at<double>(0,0));
+        }
+
+        if(argType == CALIB_DEGREES)
+        {
+            pitch *= 180./CALIB_PI;
+            yaw *= 180./CALIB_PI;
+            roll *= 180./CALIB_PI;
+        }
+        else if(argType != CALIB_RADIANS)
+            CV_Error(cv::Error::StsBadFlag, "Invalid argument type");
+
+        // Allocate the output only after src has been read completely, so that passing
+        // the same matrix as src and dst does not destroy the input.
+        dst = cv::Mat(3, 1, CV_64F);
+        dst.at<double>(0,0) = pitch;
+        dst.at<double>(1,0) = yaw;
+        dst.at<double>(2,0) = roll;
+    }
+    else if( (src.cols == 1 && src.rows == 3) ||
+             (src.cols == 3 && src.rows == 1 ) )
+    {
+        //convert vector which contains 3 angles (pitch, yaw, roll) to rotation matrix
+        // Elements are read with at<double>(), which is only valid for CV_64F data.
+        CV_Assert(src.type() == CV_64F);
+        double pitch, yaw, roll;
+        if(src.cols == 1 && src.rows == 3)
+        {
+            pitch = src.at<double>(0,0);
+            yaw = src.at<double>(1,0);
+            roll = src.at<double>(2,0);
+        }
+        else{
+            pitch = src.at<double>(0,0);
+            yaw = src.at<double>(0,1);
+            roll = src.at<double>(0,2);
+        }
+
+        if(argType == CALIB_DEGREES)
+        {
+            pitch *= CALIB_PI / 180.;
+            yaw *= CALIB_PI / 180.;
+            roll *= CALIB_PI / 180.;
+        }
+        else if(argType != CALIB_RADIANS)
+            CV_Error(cv::Error::StsBadFlag, "Invalid argument type");
+
+        dst = cv::Mat(3, 3, CV_64F);
+        cv::Mat M(3, 3, CV_64F);
+        cv::Mat i = cv::Mat::eye(3, 3, CV_64F);
+        i.copyTo(dst);
+        i.copyTo(M);
+
+        // dst starts as the rotation about the X axis by pitch (row-major 3x3 storage).
+        double* pR = dst.ptr<double>();
+        pR[4] = cos(pitch);
+        pR[7] = sin(pitch);
+        pR[8] = pR[4];
+        pR[5] = -pR[7];
+
+        // M = rotation about the Y axis by yaw.
+        double* pM = M.ptr<double>();
+        pM[0] = cos(yaw);
+        pM[2] = sin(yaw);
+        pM[8] = pM[0];
+        pM[6] = -pM[2];
+
+        // operator*= on cv::Mat is a matrix product: dst = Rx * Ry.
+        dst *= M;
+        // Reset M to identity (same size and type, so pM stays valid) and
+        // turn it into the rotation about the Z axis by roll.
+        i.copyTo(M);
+        pM[0] = cos(roll);
+        pM[3] = sin(roll);
+        pM[4] = pM[0];
+        pM[1] = -pM[3];
+
+        // dst = Rx * Ry * Rz
+        dst *= M;
+    }
+    else
+        CV_Error(cv::Error::StsBadFlag, "Input matrix must be 1x3, 3x1 or 3x3" );
+}
+
+// Rotation vector (3x1 or 1x3) -> angles (pitch, yaw, roll): the vector is first turned
+// into a rotation matrix with cv::Rodrigues and that matrix is handed to Euler().
+// argType selects the unit of the resulting angles (CALIB_RADIANS or CALIB_DEGREES).
+void calib::RodriguesToEuler(const cv::Mat& src, cv::Mat& dst, int argType)
+{
+    CV_Assert((src.cols == 1 && src.rows == 3) || (src.cols == 3 && src.rows == 1));
+
+    cv::Mat rotationMatrix;
+    cv::Rodrigues(src, rotationMatrix);
+
+    Euler(rotationMatrix, dst, argType);
+}
+
+// Angles (pitch, yaw, roll) given as a 3x1 or 1x3 vector -> rotation vector: Euler()
+// builds the rotation matrix, which cv::Rodrigues then converts into dst.
+// argType states the unit of the input angles (CALIB_RADIANS or CALIB_DEGREES).
+void calib::EulerToRodrigues(const cv::Mat& src, cv::Mat& dst, int argType)
+{
+    CV_Assert((src.cols == 1 && src.rows == 3) || (src.cols == 3 && src.rows == 1));
+
+    cv::Mat rotationMatrix;
+    Euler(src, rotationMatrix, argType);
+
+    cv::Rodrigues(rotationMatrix, dst);
+}
--- a/apps/interactive-calibration/rotationConverters.hpp
+++ b/apps/interactive-calibration/rotationConverters.hpp
@ -0,0 +1,16 @@
+#ifndef RAOTATION_CONVERTERS_HPP
+#define RAOTATION_CONVERTERS_HPP
+
+#include <opencv2/core.hpp>
+
+namespace calib
+{
+// Values for the argType parameter below: unit of the angles that are read or written.
+// These are preprocessor macros, so they are not scoped by the namespace.
+#define CALIB_RADIANS 0
+#define CALIB_DEGREES 1
+
+    // 3x3 rotation matrix -> 3x1 vector of angles (pitch, yaw, roll), or
+    // 3x1 / 1x3 vector of angles -> 3x3 rotation matrix, depending on the shape of src.
+    void Euler(const cv::Mat& src, cv::Mat& dst, int argType = CALIB_RADIANS);
+    // Rotation vector (3x1 or 1x3) -> angles, via cv::Rodrigues followed by Euler().
+    void RodriguesToEuler(const cv::Mat& src, cv::Mat& dst, int argType = CALIB_RADIANS);
+    // Angles (3x1 or 1x3) -> rotation vector, via Euler() followed by cv::Rodrigues.
+    void EulerToRodrigues(const cv::Mat& src, cv::Mat& dst, int argType = CALIB_RADIANS);
+
+}
+#endif
--- a/apps/traincascade/CMakeLists.txt
+++ b/apps/traincascade/CMakeLists.txt
@ -23,7 +23,6 @@ set_target_properties(${the_target} PROPERTIES
                      DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
                      ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
                      RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
-                      INSTALL_NAME_DIR lib
                      OUTPUT_NAME "opencv_traincascade")

 if(ENABLE_SOLUTION_FOLDERS)
--- a/apps/traincascade/cascadeclassifier.cpp
+++ b/apps/traincascade/cascadeclassifier.cpp
@ -190,6 +190,7 @@ bool CvCascadeClassifier::train( const string _cascadeDirName,
    cascadeParams.printAttrs();
    stageParams->printAttrs();
    featureParams->printAttrs();
+    cout << "Number of unique features given windowSize [" << _cascadeParams.winSize.width << "," << _cascadeParams.winSize.height << "] : " << featureEvaluator->getNumFeatures() << "" << endl;

    int startNumStages = (int)stageClassifiers.size();
    if ( startNumStages > 1 )
@ -289,7 +290,7 @@ int CvCascadeClassifier::predict( int sampleIdx )
 {
    CV_DbgAssert( sampleIdx < numPos + numNeg );
    for (vector< Ptr<CvCascadeBoost> >::iterator it = stageClassifiers.begin();
-        it != stageClassifiers.end(); it++ )
+        it != stageClassifiers.end();++it )
    {
        if ( (*it)->predict( sampleIdx ) == 0.f )
            return 0;
@ -364,7 +365,7 @@ void CvCascadeClassifier::writeStages( FileStorage &fs, const Mat& featureMap )
    int i = 0;
    fs << CC_STAGES << "[";
    for( vector< Ptr<CvCascadeBoost> >::const_iterator it = stageClassifiers.begin();
-        it != stageClassifiers.end(); it++, i++ )
+        it != stageClassifiers.end();++it, ++i )
    {
        sprintf( cmnt, "stage %d", i );
        cvWriteComment( fs.fs, cmnt, 0 );
@ -556,7 +557,7 @@ void CvCascadeClassifier::getUsedFeaturesIdxMap( Mat& featureMap )
    featureMap.setTo(Scalar(-1));

    for( vector< Ptr<CvCascadeBoost> >::const_iterator it = stageClassifiers.begin();
-        it != stageClassifiers.end(); it++ )
+        it != stageClassifiers.end();++it )
        (*it)->markUsedFeaturesInMap( featureMap );

    for( int fi = 0, idx = 0; fi < varCount; fi++ )
--- a/apps/traincascade/imagestorage.cpp
+++ b/apps/traincascade/imagestorage.cpp
@ -28,7 +28,7 @@ CvCascadeImageReader::NegReader::NegReader()

 bool CvCascadeImageReader::NegReader::create( const string _filename, Size _winSize )
 {
-    string dirname, str;
+    string str;
    std::ifstream file(_filename.c_str());
    if ( !file.is_open() )
        return false;
--- a/apps/visualisation/CMakeLists.txt
+++ b/apps/visualisation/CMakeLists.txt
@ -0,0 +1,37 @@
+# Build script for the opencv_visualisation application (cascade classifier model viewer).
+# The application is skipped silently when one of the modules listed below is unavailable.
+set(OPENCV_VISUALISATION_DEPS opencv_core opencv_highgui opencv_imgproc opencv_videoio opencv_imgcodecs)
+ocv_check_dependencies(${OPENCV_VISUALISATION_DEPS})
+
+if(NOT OCV_DEPENDENCIES_FOUND)
+   return()
+endif()
+
+project(visualisation)
+set(the_target opencv_visualisation)
+
+ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
+ocv_target_include_modules_recurse(${the_target} ${OPENCV_VISUALISATION_DEPS})
+
+file(GLOB SRCS *.cpp)
+
+set(visualisation_files ${SRCS})
+ocv_add_executable(${the_target} ${visualisation_files})
+ocv_target_link_libraries(${the_target} ${OPENCV_VISUALISATION_DEPS})
+
+# INSTALL_NAME_DIR is deliberately not set here: the property was dropped from
+# apps/traincascade/CMakeLists.txt and this application follows the same convention.
+set_target_properties(${the_target} PROPERTIES
+                      DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
+                      ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
+                      RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
+                      OUTPUT_NAME "opencv_visualisation")
+
+if(ENABLE_SOLUTION_FOLDERS)
+   set_target_properties(${the_target} PROPERTIES FOLDER "applications")
+endif()
+
+# Distribution builds install only the Release binary, and only when OpenCV is built as
+# shared libraries; regular builds always install the executable.
+if(INSTALL_CREATE_DISTRIB)
+   if(BUILD_SHARED_LIBS)
+      install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev)
+   endif()
+else()
+   install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
+endif()
--- a/apps/visualisation/opencv_visualisation.cpp
+++ b/apps/visualisation/opencv_visualisation.cpp
@ -0,0 +1,362 @@
+////////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+////////////////////////////////////////////////////////////////////////////////////////
+
+/*****************************************************************************************************
+
+Software for visualising cascade classifier models trained by OpenCV and to get a better
+understanding of the used features.
+
+USAGE:
+./opencv_visualisation --model=<model.xml> --image=<ref.png> --data=<video output folder>
+
+LIMITS
+- Use an absolute path for the output folder to ensure the tool works
+- Only handles cascade classifier models
+- Handles stumps only for the moment
+- Needs a valid training/test sample window with the original model dimensions, passed as `ref.png`
+- Can handle HAAR and LBP features
+
+Created by: Puttemans Steven - April 2016
+*****************************************************************************************************/
+
+#include <opencv2/core.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/videoio.hpp>
+
+#include <fstream>
+#include <iostream>
+
+using namespace std;
+using namespace cv;
+
+// One weighted rectangle of a HAAR feature, as read from a "rects" entry of the model.
+struct rect_data{
+    int x, y;       // top-left corner, in pixels of the model window
+    int w, h;       // width and height, in pixels of the model window
+    float weight;   // rectangle weight; its sign selects the colour used when drawing
+};
+
+// Draws the 3x3 grid of equally sized blocks that make up one LBP feature.
+// `cell` is the top-left block of the grid; every block is outlined and only the centre
+// block is filled.
+static void drawLbpGrid(Mat& canvas, const Rect& cell)
+{
+    for(int row = 0; row < 3; row++){
+        for(int col = 0; col < 3; col++){
+            const bool centre = (row == 1 && col == 1);
+            rectangle(canvas, Rect(cell.x + col * cell.width, cell.y + row * cell.height, cell.width, cell.height), Scalar(255), centre ? FILLED : 1);
+        }
+    }
+}
+
+// Allocates the black image that collects all features of one stage: `cols` feature tiles
+// per row and as many rows as are needed for `features_nmbr` tiles. Each tile has the size
+// of the reference image multiplied by `scale`.
+static Mat createStagePlane(const Mat& reference_image, int scale, int features_nmbr, int cols)
+{
+    const int rows = (features_nmbr + cols - 1) / cols;
+    return Mat::zeros(reference_image.rows * scale * rows, reference_image.cols * scale * cols, CV_8UC1);
+}
+
+// Writes the combined image of stage `sid` to "<output_folder>stage_<sid>.png".
+// A stage without features has an empty plane, which is not written.
+static void saveStagePlane(const string& output_folder, int sid, const Mat& image_plane)
+{
+    if(image_plane.empty()){
+        return;
+    }
+    stringstream save_location;
+    save_location << output_folder << "stage_" << sid << ".png";
+    imwrite(save_location.str(), image_plane);
+}
+
+// Renders the two description lines on a copy of `metadata` and shows it in the
+// "metadata" window.
+static void showMetadata(const Mat& metadata, const string& line1, const string& line2)
+{
+    Mat temp_metadata = metadata.clone();
+    putText(temp_metadata, line1, Point(15,15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(255));
+    putText(temp_metadata, line2, Point(15,40), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(255));
+    imshow("metadata", temp_metadata);
+}
+
+// Entry point: loads a cascade classifier model and a reference image, shows every feature
+// of every stage on the enlarged reference image and, when an output folder is given,
+// stores a video of the features plus one overview image per stage.
+// Returns 0 on success and -1 when the arguments, the model or the image are unusable.
+int main( int argc, const char** argv )
+{
+    CommandLineParser parser(argc, argv,
+        "{ help h usage ? |      | show this message }"
+        "{ image i        |      | (required) path to reference image }"
+        "{ model m        |      | (required) path to cascade xml file }"
+        "{ data d         |      | (optional) path to video output folder }"
+    );
+    // Read in the input arguments
+    if (parser.has("help")){
+        parser.printMessage();
+        return 0;
+    }
+    string model(parser.get<string>("model"));
+    string output_folder(parser.get<string>("data"));
+    string image_ref = (parser.get<string>("image"));
+    if (model.empty() || image_ref.empty()){
+        parser.printMessage();
+        return -1;
+    }
+    // File names are appended directly to the folder, so make sure it ends with a separator
+    if (!output_folder.empty()){
+        const char last = output_folder[output_folder.size() - 1];
+        if (last != '/' && last != '\\'){
+            output_folder += '/';
+        }
+    }
+
+    // Value for timing
+    // You can increase this to have a better visualisation during the generation
+    int timing = 1;
+
+    // Value for cols of storing elements
+    int cols_prefered = 5;
+
+    // Open the XML model
+    FileStorage fs;
+    bool model_ok = fs.open(model, FileStorage::READ);
+    if (!model_ok){
+        cerr << "the cascade file '" << model << "' could not be loaded." << endl;
+        return  -1;
+    }
+    // Get all the required information
+    // First decide which feature type we are using
+    FileNode cascade = fs["cascade"];
+    string feature_type = cascade["featureType"];
+    const bool haar = (feature_type.compare("HAAR") == 0);
+    const bool lbp = (feature_type.compare("LBP") == 0);
+    if (!haar && !lbp){
+        cerr << "The model is not an HAAR or LBP feature based model!" << endl;
+        cerr << "Please select a model that can be visualized by the software." << endl;
+        return -1;
+    }
+
+    // We make a visualisation mask - which increases the window to make it at least a bit more visible
+    int resize_factor = 10;
+    int resize_storage_factor = 10;
+    Mat reference_image = imread(image_ref, IMREAD_GRAYSCALE );
+    if (reference_image.empty()){
+        cerr << "the reference image '" << image_ref << "' could not be loaded." << endl;
+        return -1;
+    }
+    Mat visualization;
+    resize(reference_image, visualization, Size(reference_image.cols * resize_factor, reference_image.rows * resize_factor));
+
+    // First recover for each stage the number of weak features and their index
+    // Important since it is NOT sequential when using LBP features
+    vector< vector<int> > stage_features;
+    FileNode stages = cascade["stages"];
+    FileNodeIterator it_stages = stages.begin(), it_stages_end = stages.end();
+    for( ; it_stages != it_stages_end; ++it_stages ){
+        vector<int> current_feature_indexes;
+        FileNode weak_classifiers = (*it_stages)["weakClassifiers"];
+        FileNodeIterator it_weak = weak_classifiers.begin(), it_weak_end = weak_classifiers.end();
+        vector<int> values;
+        for( ; it_weak != it_weak_end; ++it_weak ){
+            (*it_weak)["internalNodes"] >> values;
+            // The feature index is read from the third entry (only stumps are handled)
+            if( values.size() < 3 ){
+                cerr << "The model contains a weak classifier without a valid 'internalNodes' entry." << endl;
+                return -1;
+            }
+            current_feature_indexes.push_back( values[2] );
+        }
+        stage_features.push_back(current_feature_indexes);
+    }
+
+    // If the output option has been chosen then we will store a combined image plane for
+    // each stage, containing all weak classifiers for that stage.
+    const bool draw_planes = !output_folder.empty();
+    VideoWriter result_video;
+    if( draw_planes ){
+        stringstream output_video;
+        output_video << output_folder << "model_visualization.avi";
+        result_video.open(output_video.str(), VideoWriter::fourcc('X','V','I','D'), 15, Size(reference_image.cols * resize_factor, reference_image.rows * resize_factor), false);
+    }
+
+    if(haar){
+        // Grab the corresponding features dimensions and weights
+        FileNode features = cascade["features"];
+        vector< vector< rect_data > > feature_data;
+        FileNodeIterator it_features = features.begin(), it_features_end = features.end();
+        for( ; it_features != it_features_end; ++it_features ){
+            vector< rect_data > current_feature_rectangles;
+            FileNode rectangles = (*it_features)["rects"];
+            int nrects = (int)rectangles.size();
+            for(int k = 0; k < nrects; k++){
+                rect_data current_data;
+                FileNode single_rect = rectangles[k];
+                current_data.x = (int)single_rect[0];
+                current_data.y = (int)single_rect[1];
+                current_data.w = (int)single_rect[2];
+                current_data.h = (int)single_rect[3];
+                current_data.weight = (float)single_rect[4];
+                current_feature_rectangles.push_back(current_data);
+            }
+            feature_data.push_back(current_feature_rectangles);
+        }
+
+        // Loop over each possible feature on its index, visualise on the mask and wait a bit,
+        // then continue to the next feature.
+        // If visualisations should be stored then do the in between calculations
+        Mat image_plane;
+        Mat metadata = Mat::zeros(150, 1000, CV_8UC1);
+        for(int sid = 0; sid < (int)stage_features.size(); sid++){
+            const int features_nmbr = (int)stage_features[sid].size();
+            if(draw_planes){
+                image_plane = createStagePlane(reference_image, resize_storage_factor, features_nmbr, cols_prefered);
+            }
+            for(int fid = 0; fid < features_nmbr; fid++){
+                const int current_feature_index = stage_features[sid][fid];
+                if(current_feature_index < 0 || current_feature_index >= (int)feature_data.size()){
+                    cerr << "Stage " << sid << " refers to feature " << current_feature_index << " which is not part of the model." << endl;
+                    return -1;
+                }
+                const vector< rect_data >& current_rects = feature_data[current_feature_index];
+
+                stringstream meta1, meta2;
+                meta1 << "Stage " << sid << " / Feature " << fid;
+                meta2 << "Rectangles: ";
+                Mat temp_window = visualization.clone();
+                // The enlarged copy for the stage overview is only needed when storing output
+                Mat single_feature;
+                if(draw_planes){
+                    resize(reference_image, single_feature, Size(), resize_storage_factor, resize_storage_factor);
+                }
+                for(int i = 0; i < (int)current_rects.size(); i++){
+                    const rect_data& local = current_rects[i];
+                    // Rectangles with a non-negative weight are drawn black, the others white
+                    const Scalar colour = (local.weight >= 0) ? Scalar(0) : Scalar(255);
+                    if(draw_planes){
+                        rectangle(single_feature, Rect(local.x * resize_storage_factor, local.y * resize_storage_factor, local.w * resize_storage_factor, local.h * resize_storage_factor), colour, FILLED);
+                    }
+                    Rect part(local.x * resize_factor, local.y * resize_factor, local.w * resize_factor, local.h * resize_factor);
+                    meta2 << part << " (w " << local.weight << ") ";
+                    rectangle(temp_window, part, colour, FILLED);
+                }
+                imshow("features", temp_window);
+                putText(temp_window, meta1.str(), Point(15,15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(255));
+                if(draw_planes){
+                    // The video writer is only opened when an output folder was given
+                    result_video.write(temp_window);
+                    // Copy the feature image into its tile of the stage overview
+                    single_feature.copyTo(image_plane(Rect((fid%cols_prefered)*single_feature.cols, (fid/cols_prefered) * single_feature.rows, single_feature.cols, single_feature.rows)));
+                }
+                showMetadata(metadata, meta1.str(), meta2.str());
+                waitKey(timing);
+            }
+            //Store the stage image if needed
+            if(draw_planes){
+                saveStagePlane(output_folder, sid, image_plane);
+            }
+        }
+    }
+
+    if(lbp){
+        // Grab the corresponding features dimensions and weights
+        FileNode features = cascade["features"];
+        vector<Rect> feature_data;
+        FileNodeIterator it_features = features.begin(), it_features_end = features.end();
+        for( ; it_features != it_features_end; ++it_features ){
+            FileNode rect_node = (*it_features)["rect"];
+            Rect current_feature ((int)rect_node[0], (int)rect_node[1], (int)rect_node[2], (int)rect_node[3]);
+            feature_data.push_back(current_feature);
+        }
+
+        // Loop over each possible feature on its index, visualise on the mask and wait a bit,
+        // then continue to the next feature.
+        Mat image_plane;
+        Mat metadata = Mat::zeros(150, 1000, CV_8UC1);
+        for(int sid = 0; sid < (int)stage_features.size(); sid++){
+            const int features_nmbr = (int)stage_features[sid].size();
+            if(draw_planes){
+                image_plane = createStagePlane(reference_image, resize_storage_factor, features_nmbr, cols_prefered);
+            }
+            for(int fid = 0; fid < features_nmbr; fid++){
+                const int current_feature_index = stage_features[sid][fid];
+                if(current_feature_index < 0 || current_feature_index >= (int)feature_data.size()){
+                    cerr << "Stage " << sid << " refers to feature " << current_feature_index << " which is not part of the model." << endl;
+                    return -1;
+                }
+                const Rect current_rect = feature_data[current_feature_index];
+
+                stringstream meta1, meta2;
+                meta1 << "Stage " << sid << " / Feature " << fid;
+                meta2 << "Rectangle: ";
+                Mat temp_window = visualization.clone();
+
+                // VISUALISATION
+                // The rectangle is the top left one of a 3x3 block LBP constructor
+                Rect resized(current_rect.x * resize_factor, current_rect.y * resize_factor, current_rect.width * resize_factor, current_rect.height * resize_factor);
+                meta2 << resized;
+                drawLbpGrid(temp_window, resized);
+
+                if(draw_planes){
+                    Mat single_feature;
+                    resize(reference_image, single_feature, Size(), resize_storage_factor, resize_storage_factor);
+                    Rect resized_inner(current_rect.x * resize_storage_factor, current_rect.y * resize_storage_factor, current_rect.width * resize_storage_factor, current_rect.height * resize_storage_factor);
+                    drawLbpGrid(single_feature, resized_inner);
+
+                    single_feature.copyTo(image_plane(Rect((fid%cols_prefered)*single_feature.cols, (fid/cols_prefered) * single_feature.rows, single_feature.cols, single_feature.rows)));
+                }
+
+                showMetadata(metadata, meta1.str(), meta2.str());
+                imshow("features", temp_window);
+                putText(temp_window, meta1.str(), Point(15,15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(255));
+                if(draw_planes){
+                    // The video writer is only opened when an output folder was given
+                    result_video.write(temp_window);
+                }
+
+                waitKey(timing);
+            }
+
+            //Store the stage image if needed
+            if(draw_planes){
+                saveStagePlane(output_folder, sid, image_plane);
+            }
+        }
+    }
+    return 0;
+}
--- a/cmake/OpenCVDetectAndroidSDK.cmake
+++ b/cmake/OpenCVDetectAndroidSDK.cmake
@ -287,7 +287,7 @@ macro(add_android_project target path)
          set(android_proj_NATIVE_DEPS ${android_proj_NATIVE_DEPS} android)
        endif()

-        add_library(${JNI_LIB_NAME} MODULE ${android_proj_jni_files})
+        add_library(${JNI_LIB_NAME} SHARED ${android_proj_jni_files})
        ocv_target_include_modules_recurse(${JNI_LIB_NAME} ${android_proj_NATIVE_DEPS})
        ocv_target_include_directories(${JNI_LIB_NAME} "${path}/jni")
        ocv_target_link_libraries(${JNI_LIB_NAME} ${OPENCV_LINKER_LIBS} ${android_proj_NATIVE_DEPS})
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@ -33,12 +33,17 @@ if(CUDA_FOUND)
    if(WIN32)
      find_cuda_helper_libs(nvcuvenc)
    endif()
-    set(HAVE_NVCUVID 1)
+    if(CUDA_nvcuvid_LIBRARY)
+      set(HAVE_NVCUVID 1)
+    endif()
+    if(CUDA_nvcuvenc_LIBRARY)
+      set(HAVE_NVCUVENC 1)
+    endif()
  endif()

  message(STATUS "CUDA detected: " ${CUDA_VERSION})

-  set(_generations "Fermi" "Kepler")
+  set(_generations "Fermi" "Kepler" "Maxwell" "Pascal")
  if(NOT CMAKE_CROSSCOMPILING)
    list(APPEND _generations "Auto")
  endif()
@ -58,13 +63,13 @@ if(CUDA_FOUND)

  set(__cuda_arch_ptx "")
  if(CUDA_GENERATION STREQUAL "Fermi")
-    set(__cuda_arch_bin "2.0 2.1(2.0)")
+    set(__cuda_arch_bin "2.0")
  elseif(CUDA_GENERATION STREQUAL "Kepler")
-    if(${CUDA_VERSION} VERSION_LESS "5.0")
-      set(__cuda_arch_bin "3.0")
-    else()
-      set(__cuda_arch_bin "3.0 3.5")
-    endif()
+    set(__cuda_arch_bin "3.0 3.5")
+  elseif(CUDA_GENERATION STREQUAL "Maxwell")
+    set(__cuda_arch_bin "5.0")
+  elseif(CUDA_GENERATION STREQUAL "Pascal")
+    set(__cuda_arch_bin "6.0")
  elseif(CUDA_GENERATION STREQUAL "Auto")
    execute_process( COMMAND "${CUDA_NVCC_EXECUTABLE}" "${OpenCV_SOURCE_DIR}/cmake/checks/OpenCVDetectCudaArch.cu" "--run"
                     WORKING_DIRECTORY "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/"
@ -86,14 +91,12 @@ if(CUDA_FOUND)
      set(__cuda_arch_bin "5.3")
      set(__cuda_arch_ptx "")
    else()
-      if(${CUDA_VERSION} VERSION_LESS "5.0")
-        set(__cuda_arch_bin "1.1 1.2 1.3 2.0 2.1(2.0) 3.0")
-      elseif(${CUDA_VERSION} VERSION_GREATER "6.5")
-        set(__cuda_arch_bin "2.0 2.1(2.0) 3.0 3.5")
+      if(${CUDA_VERSION} VERSION_LESS "8.0")
+        set(__cuda_arch_bin "2.0 3.0 3.5 5.0")
      else()
-        set(__cuda_arch_bin "1.1 1.2 1.3 2.0 2.1(2.0) 3.0 3.5")
+        set(__cuda_arch_bin "2.0 3.0 3.5 5.0 6.0")
      endif()
-      set(__cuda_arch_ptx "3.0")
+      set(__cuda_arch_ptx "")
    endif()
  endif()

@ -133,6 +136,7 @@ if(CUDA_FOUND)
      set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}")
    endif()
  endforeach()
+  set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -D_FORCE_INLINES)

  # Tell NVCC to add PTX intermediate code for the specified architectures
  string(REGEX MATCHALL "[0-9]+" ARCH_LIST "${ARCH_PTX_NO_POINTS}")
--- a/cmake/OpenCVDetectOpenCL.cmake
+++ b/cmake/OpenCVDetectOpenCL.cmake
@ -1,21 +1,21 @@
+set(OPENCL_FOUND ON CACHE BOOL "OpenCL library is found")
 if(APPLE)
-  set(OPENCL_FOUND YES)
  set(OPENCL_LIBRARY "-framework OpenCL" CACHE STRING "OpenCL library")
-  set(OPENCL_INCLUDE_DIR "" CACHE STRING "OpenCL include directory")
-  mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY)
-  set(HAVE_OPENCL_STATIC ON)
+  set(OPENCL_INCLUDE_DIR "" CACHE PATH "OpenCL include directory")
 else(APPLE)
-  set(OPENCL_FOUND YES)
-  set(HAVE_OPENCL_STATIC OFF)
-  set(OPENCL_INCLUDE_DIR "${OpenCV_SOURCE_DIR}/3rdparty/include/opencl/1.2")
+  set(OPENCL_LIBRARY "" CACHE STRING "OpenCL library")
+  set(OPENCL_INCLUDE_DIR "${OpenCV_SOURCE_DIR}/3rdparty/include/opencl/1.2" CACHE PATH "OpenCL include directory")
 endif(APPLE)
-
-if(WINRT)
-  set(OPENCL_FOUND NO)
-  set(HAVE_OPENCL_STATIC OFF)
-endif(WINRT)
+mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY)

 if(OPENCL_FOUND)
+  if(OPENCL_LIBRARY)
+    set(HAVE_OPENCL_STATIC ON)
+    set(OPENCL_LIBRARIES "${OPENCL_LIBRARY}")
+  else()
+    set(HAVE_OPENCL_STATIC OFF)
+  endif()
+
  if(NOT HAVE_OPENCL_STATIC)
    try_compile(__VALID_OPENCL
      "${OpenCV_BINARY_DIR}"
@ -29,20 +29,12 @@ if(OPENCL_FOUND)
    endif()
  endif()

-  if(NOT WINRT)
-    set(HAVE_OPENCL 1)
-  endif()
+  set(HAVE_OPENCL 1)

  if(WITH_OPENCL_SVM)
    set(HAVE_OPENCL_SVM 1)
  endif()

-  if(HAVE_OPENCL_STATIC)
-    set(OPENCL_LIBRARIES "${OPENCL_LIBRARY}")
-  else()
-    unset(OPENCL_LIBRARIES)
-  endif()
-
  set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR})

  if(WITH_OPENCLAMDFFT)
--- a/Show More
+++ b/Show More