From 5877debb6f6b2599c7f2cc9e1ce5ad9d4931cd9b Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@itseez.com>
Date: Tue, 22 Mar 2016 16:52:23 +0300
Subject: [PATCH 1/7] HAL resize, warpAffine, warpPerspective interface

- added HAL documentation support
- added documentation to HAL replacement interface
- updated several HAL functions in imgproc module
---
 doc/CMakeLists.txt                            |   8 +-
 modules/core/include/opencv2/core/cvdef.h     |  61 ----
 modules/core/include/opencv2/core/hal/hal.hpp |  14 +-
 .../core/include/opencv2/core/hal/interface.h | 121 +++++--
 modules/core/src/arithm.cpp                   |  44 +--
 modules/core/src/arithm_core.hpp              |   4 +-
 modules/core/src/hal_replacement.hpp          | 329 +++++++++++++-----
 modules/imgproc/include/opencv2/imgproc.hpp   |   5 +
 .../include/opencv2/imgproc/hal/hal.hpp       |  23 +-
 .../include/opencv2/imgproc/hal/interface.h   |  26 ++
 modules/imgproc/src/hal_replacement.hpp       | 298 +++++++++++++++-
 modules/imgproc/src/imgwarp.cpp               | 228 +++++++-----
 modules/imgproc/src/morph.cpp                 |  16 +-
 13 files changed, 866 insertions(+), 311 deletions(-)
 create mode 100644 modules/imgproc/include/opencv2/imgproc/hal/interface.h

diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index c8ee7630e..ef579fb97 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -42,6 +42,7 @@ if(BUILD_DOCS AND DOXYGEN_FOUND)
   set(paths_bib)
   set(paths_sample)
   set(paths_tutorial)
+  set(paths_hal_interface)
   set(refs_main)
   set(refs_extra)
   set(deps)
@@ -87,6 +88,11 @@ if(BUILD_DOCS AND DOXYGEN_FOUND)
           file(APPEND "${tutorial_contrib_root}" "- ${m}. @subpage ${tutorial_id}\n")
         endforeach()
       endif()
+      # HAL replacement file
+      set(replacement_header "${OPENCV_MODULE_opencv_${m}_LOCATION}/src/hal_replacement.hpp")
+      if(EXISTS "${replacement_header}")
+        list(APPEND paths_hal_interface "${replacement_header}")
+      endif()
 
       # BiBTeX file
       set(bib_file "${docs_dir}/${m}.bib")
@@ -131,7 +137,7 @@ if(BUILD_DOCS AND DOXYGEN_FOUND)
   set(example_path "${CMAKE_SOURCE_DIR}/samples")
 
   # set export variables
-  string(REPLACE ";" " \\\n" CMAKE_DOXYGEN_INPUT_LIST "${rootfile} ; ${faqfile} ; ${paths_include} ; ${paths_doc} ; ${tutorial_path} ; ${tutorial_py_path} ; ${paths_tutorial} ; ${tutorial_contrib_root}")
+  string(REPLACE ";" " \\\n" CMAKE_DOXYGEN_INPUT_LIST "${rootfile} ; ${faqfile} ; ${paths_include} ; ${paths_hal_interface} ; ${paths_doc} ; ${tutorial_path} ; ${tutorial_py_path} ; ${paths_tutorial} ; ${tutorial_contrib_root}")
   string(REPLACE ";" " \\\n" CMAKE_DOXYGEN_IMAGE_PATH "${paths_doc} ; ${tutorial_path} ; ${tutorial_py_path} ; ${paths_tutorial}")
   # TODO: remove paths_doc from EXAMPLE_PATH after face module tutorials/samples moved to separate folders
   string(REPLACE ";" " \\\n" CMAKE_DOXYGEN_EXAMPLE_PATH  "${example_path} ; ${paths_doc} ; ${paths_sample}")
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index af2abfbb2..c00591419 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -357,67 +357,6 @@ Cv64suf;
 *                                  Matrix type (Mat)                                     *
 \****************************************************************************************/
 
-#define CV_CN_MAX     512
-#define CV_CN_SHIFT   3
-#define CV_DEPTH_MAX  (1 << CV_CN_SHIFT)
-
-#define CV_8U   0
-#define CV_8S   1
-#define CV_16U  2
-#define CV_16S  3
-#define CV_32S  4
-#define CV_32F  5
-#define CV_64F  6
-#define CV_USRTYPE1 7
-
-#define CV_MAT_DEPTH_MASK       (CV_DEPTH_MAX - 1)
-#define CV_MAT_DEPTH(flags)     ((flags) & CV_MAT_DEPTH_MASK)
-
-#define CV_MAKETYPE(depth,cn) (CV_MAT_DEPTH(depth) + (((cn)-1) << CV_CN_SHIFT))
-#define CV_MAKE_TYPE CV_MAKETYPE
-
-#define CV_8UC1 CV_MAKETYPE(CV_8U,1)
-#define CV_8UC2 CV_MAKETYPE(CV_8U,2)
-#define CV_8UC3 CV_MAKETYPE(CV_8U,3)
-#define CV_8UC4 CV_MAKETYPE(CV_8U,4)
-#define CV_8UC(n) CV_MAKETYPE(CV_8U,(n))
-
-#define CV_8SC1 CV_MAKETYPE(CV_8S,1)
-#define CV_8SC2 CV_MAKETYPE(CV_8S,2)
-#define CV_8SC3 CV_MAKETYPE(CV_8S,3)
-#define CV_8SC4 CV_MAKETYPE(CV_8S,4)
-#define CV_8SC(n) CV_MAKETYPE(CV_8S,(n))
-
-#define CV_16UC1 CV_MAKETYPE(CV_16U,1)
-#define CV_16UC2 CV_MAKETYPE(CV_16U,2)
-#define CV_16UC3 CV_MAKETYPE(CV_16U,3)
-#define CV_16UC4 CV_MAKETYPE(CV_16U,4)
-#define CV_16UC(n) CV_MAKETYPE(CV_16U,(n))
-
-#define CV_16SC1 CV_MAKETYPE(CV_16S,1)
-#define CV_16SC2 CV_MAKETYPE(CV_16S,2)
-#define CV_16SC3 CV_MAKETYPE(CV_16S,3)
-#define CV_16SC4 CV_MAKETYPE(CV_16S,4)
-#define CV_16SC(n) CV_MAKETYPE(CV_16S,(n))
-
-#define CV_32SC1 CV_MAKETYPE(CV_32S,1)
-#define CV_32SC2 CV_MAKETYPE(CV_32S,2)
-#define CV_32SC3 CV_MAKETYPE(CV_32S,3)
-#define CV_32SC4 CV_MAKETYPE(CV_32S,4)
-#define CV_32SC(n) CV_MAKETYPE(CV_32S,(n))
-
-#define CV_32FC1 CV_MAKETYPE(CV_32F,1)
-#define CV_32FC2 CV_MAKETYPE(CV_32F,2)
-#define CV_32FC3 CV_MAKETYPE(CV_32F,3)
-#define CV_32FC4 CV_MAKETYPE(CV_32F,4)
-#define CV_32FC(n) CV_MAKETYPE(CV_32F,(n))
-
-#define CV_64FC1 CV_MAKETYPE(CV_64F,1)
-#define CV_64FC2 CV_MAKETYPE(CV_64F,2)
-#define CV_64FC3 CV_MAKETYPE(CV_64F,3)
-#define CV_64FC4 CV_MAKETYPE(CV_64F,4)
-#define CV_64FC(n) CV_MAKETYPE(CV_64F,(n))
-
 #define CV_MAT_CN_MASK          ((CV_CN_MAX - 1) << CV_CN_SHIFT)
 #define CV_MAT_CN(flags)        ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
 #define CV_MAT_TYPE_MASK        (CV_DEPTH_MAX*CV_CN_MAX - 1)
diff --git a/modules/core/include/opencv2/core/hal/hal.hpp b/modules/core/include/opencv2/core/hal/hal.hpp
index 118913eb7..64af09ab8 100644
--- a/modules/core/include/opencv2/core/hal/hal.hpp
+++ b/modules/core/include/opencv2/core/hal/hal.hpp
@@ -171,13 +171,13 @@ CV_EXPORTS void div32s( const int* src1, size_t step1, const int* src2, size_t s
 CV_EXPORTS void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
 CV_EXPORTS void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
 
-CV_EXPORTS void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
-CV_EXPORTS void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
-CV_EXPORTS void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
-CV_EXPORTS void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
-CV_EXPORTS void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
-CV_EXPORTS void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
-CV_EXPORTS void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip8u( const uchar *, size_t, const uchar * src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip8s( const schar *, size_t, const schar * src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip16u( const ushort *, size_t, const ushort * src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip16s( const short *, size_t, const short * src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip32s( const int *, size_t, const int * src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip32f( const float *, size_t, const float * src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip64f( const double *, size_t, const double * src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
 
 CV_EXPORTS void addWeighted8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _scalars );
 CV_EXPORTS void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scalars );
diff --git a/modules/core/include/opencv2/core/hal/interface.h b/modules/core/include/opencv2/core/hal/interface.h
index 51f760610..ba3f55c33 100644
--- a/modules/core/include/opencv2/core/hal/interface.h
+++ b/modules/core/include/opencv2/core/hal/interface.h
@@ -1,19 +1,16 @@
-#ifndef _HAL_INTERFACE_HPP_INCLUDED_
-#define _HAL_INTERFACE_HPP_INCLUDED_
+#ifndef OPENCV_CORE_HAL_INTERFACE_H
+#define OPENCV_CORE_HAL_INTERFACE_H
 
 //! @addtogroup core_hal_interface
 //! @{
 
+//! @name Return codes
+//! @{
 #define CV_HAL_ERROR_OK 0
 #define CV_HAL_ERROR_NOT_IMPLEMENTED 1
 #define CV_HAL_ERROR_UNKNOWN -1
+//! @}
 
-#define CV_HAL_CMP_EQ 0
-#define CV_HAL_CMP_GT 1
-#define CV_HAL_CMP_GE 2
-#define CV_HAL_CMP_LT 3
-#define CV_HAL_CMP_LE 4
-#define CV_HAL_CMP_NE 5
 
 #ifdef __cplusplus
 #include <cstddef>
@@ -21,18 +18,17 @@
 #include <stddef.h>
 #endif
 
-/* primitive types */
-/*
-  schar  - signed 1 byte integer
-  uchar  - unsigned 1 byte integer
-  short  - signed 2 byte integer
-  ushort - unsigned 2 byte integer
-  int    - signed 4 byte integer
-  uint   - unsigned 4 byte integer
-  int64  - signed 8 byte integer
-  uint64 - unsigned 8 byte integer
-*/
-
+//! @name Data types
+//! primitive types
+//! - schar  - signed 1 byte integer
+//! - uchar  - unsigned 1 byte integer
+//! - short  - signed 2 byte integer
+//! - ushort - unsigned 2 byte integer
+//! - int    - signed 4 byte integer
+//! - uint   - unsigned 4 byte integer
+//! - int64  - signed 8 byte integer
+//! - uint64 - unsigned 8 byte integer
+//! @{
 #if !defined _MSC_VER && !defined __BORLANDC__
 #  if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__
 #    include <cstdint>
@@ -64,6 +60,91 @@ typedef signed char schar;
 #  define CV_BIG_UINT(n)  n##ULL
 #endif
 
+#define CV_CN_MAX     512
+#define CV_CN_SHIFT   3
+#define CV_DEPTH_MAX  (1 << CV_CN_SHIFT)
+
+#define CV_8U   0
+#define CV_8S   1
+#define CV_16U  2
+#define CV_16S  3
+#define CV_32S  4
+#define CV_32F  5
+#define CV_64F  6
+#define CV_USRTYPE1 7
+
+#define CV_MAT_DEPTH_MASK       (CV_DEPTH_MAX - 1)
+#define CV_MAT_DEPTH(flags)     ((flags) & CV_MAT_DEPTH_MASK)
+
+#define CV_MAKETYPE(depth,cn) (CV_MAT_DEPTH(depth) + (((cn)-1) << CV_CN_SHIFT))
+#define CV_MAKE_TYPE CV_MAKETYPE
+
+#define CV_8UC1 CV_MAKETYPE(CV_8U,1)
+#define CV_8UC2 CV_MAKETYPE(CV_8U,2)
+#define CV_8UC3 CV_MAKETYPE(CV_8U,3)
+#define CV_8UC4 CV_MAKETYPE(CV_8U,4)
+#define CV_8UC(n) CV_MAKETYPE(CV_8U,(n))
+
+#define CV_8SC1 CV_MAKETYPE(CV_8S,1)
+#define CV_8SC2 CV_MAKETYPE(CV_8S,2)
+#define CV_8SC3 CV_MAKETYPE(CV_8S,3)
+#define CV_8SC4 CV_MAKETYPE(CV_8S,4)
+#define CV_8SC(n) CV_MAKETYPE(CV_8S,(n))
+
+#define CV_16UC1 CV_MAKETYPE(CV_16U,1)
+#define CV_16UC2 CV_MAKETYPE(CV_16U,2)
+#define CV_16UC3 CV_MAKETYPE(CV_16U,3)
+#define CV_16UC4 CV_MAKETYPE(CV_16U,4)
+#define CV_16UC(n) CV_MAKETYPE(CV_16U,(n))
+
+#define CV_16SC1 CV_MAKETYPE(CV_16S,1)
+#define CV_16SC2 CV_MAKETYPE(CV_16S,2)
+#define CV_16SC3 CV_MAKETYPE(CV_16S,3)
+#define CV_16SC4 CV_MAKETYPE(CV_16S,4)
+#define CV_16SC(n) CV_MAKETYPE(CV_16S,(n))
+
+#define CV_32SC1 CV_MAKETYPE(CV_32S,1)
+#define CV_32SC2 CV_MAKETYPE(CV_32S,2)
+#define CV_32SC3 CV_MAKETYPE(CV_32S,3)
+#define CV_32SC4 CV_MAKETYPE(CV_32S,4)
+#define CV_32SC(n) CV_MAKETYPE(CV_32S,(n))
+
+#define CV_32FC1 CV_MAKETYPE(CV_32F,1)
+#define CV_32FC2 CV_MAKETYPE(CV_32F,2)
+#define CV_32FC3 CV_MAKETYPE(CV_32F,3)
+#define CV_32FC4 CV_MAKETYPE(CV_32F,4)
+#define CV_32FC(n) CV_MAKETYPE(CV_32F,(n))
+
+#define CV_64FC1 CV_MAKETYPE(CV_64F,1)
+#define CV_64FC2 CV_MAKETYPE(CV_64F,2)
+#define CV_64FC3 CV_MAKETYPE(CV_64F,3)
+#define CV_64FC4 CV_MAKETYPE(CV_64F,4)
+#define CV_64FC(n) CV_MAKETYPE(CV_64F,(n))
+//! @}
+
+//! @name Comparison operation
+//! @sa cv::CmpTypes
+//! @{
+#define CV_HAL_CMP_EQ 0
+#define CV_HAL_CMP_GT 1
+#define CV_HAL_CMP_GE 2
+#define CV_HAL_CMP_LT 3
+#define CV_HAL_CMP_LE 4
+#define CV_HAL_CMP_NE 5
+//! @}
+
+//! @name Border processing modes
+//! @sa cv::BorderTypes
+//! @{
+#define CV_HAL_BORDER_CONSTANT 0
+#define CV_HAL_BORDER_REPLICATE 1
+#define CV_HAL_BORDER_REFLECT 2
+#define CV_HAL_BORDER_WRAP 3
+#define CV_HAL_BORDER_REFLECT_101 4
+#define CV_HAL_BORDER_TRANSPARENT 5
+#define CV_HAL_BORDER_ISOLATED 16
+//! @}
+
 //! @}
 
 #endif
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index c3acca054..8ef3370bd 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -3123,7 +3123,7 @@ void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
     if( src1 )
         div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
     else
-        recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
+        recip_i(src2, step2, dst, step, width, height, *(const double*)scale);
 }
 
 void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
@@ -3172,53 +3172,53 @@ void div64f( const double* src1, size_t step1, const double* src2, size_t step2,
 // Reciprocial
 //=======================================
 
-void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
+void recip8u( const uchar*, size_t, const uchar* src2, size_t step2,
                   uchar* dst, size_t step, int width, int height, void* scale)
 {
-    CALL_HAL(recip8u, cv_hal_recip8u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
-    recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
+    CALL_HAL(recip8u, cv_hal_recip8u, src2, step2, dst, step, width, height, *(const double*)scale)
+    recip_i(src2, step2, dst, step, width, height, *(const double*)scale);
 }
 
-void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
+void recip8s( const schar*, size_t, const schar* src2, size_t step2,
                   schar* dst, size_t step, int width, int height, void* scale)
 {
-    CALL_HAL(recip8s, cv_hal_recip8s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
-    recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
+    CALL_HAL(recip8s, cv_hal_recip8s, src2, step2, dst, step, width, height, *(const double*)scale)
+    recip_i(src2, step2, dst, step, width, height, *(const double*)scale);
 }
 
-void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
+void recip16u( const ushort*, size_t, const ushort* src2, size_t step2,
                    ushort* dst, size_t step, int width, int height, void* scale)
 {
-    CALL_HAL(recip16u, cv_hal_recip16u, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
-    recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
+    CALL_HAL(recip16u, cv_hal_recip16u, src2, step2, dst, step, width, height, *(const double*)scale)
+    recip_i(src2, step2, dst, step, width, height, *(const double*)scale);
 }
 
-void recip16s( const short* src1, size_t step1, const short* src2, size_t step2,
+void recip16s( const short*, size_t, const short* src2, size_t step2,
                    short* dst, size_t step, int width, int height, void* scale)
 {
-    CALL_HAL(recip16s, cv_hal_recip16s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
-    recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
+    CALL_HAL(recip16s, cv_hal_recip16s, src2, step2, dst, step, width, height, *(const double*)scale)
+    recip_i(src2, step2, dst, step, width, height, *(const double*)scale);
 }
 
-void recip32s( const int* src1, size_t step1, const int* src2, size_t step2,
+void recip32s( const int*, size_t, const int* src2, size_t step2,
                    int* dst, size_t step, int width, int height, void* scale)
 {
-    CALL_HAL(recip32s, cv_hal_recip32s, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
-    recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
+    CALL_HAL(recip32s, cv_hal_recip32s, src2, step2, dst, step, width, height, *(const double*)scale)
+    recip_i(src2, step2, dst, step, width, height, *(const double*)scale);
 }
 
-void recip32f( const float* src1, size_t step1, const float* src2, size_t step2,
+void recip32f( const float*, size_t, const float* src2, size_t step2,
                    float* dst, size_t step, int width, int height, void* scale)
 {
-    CALL_HAL(recip32f, cv_hal_recip32f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
-    recip_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
+    CALL_HAL(recip32f, cv_hal_recip32f, src2, step2, dst, step, width, height, *(const double*)scale)
+    recip_f(src2, step2, dst, step, width, height, *(const double*)scale);
 }
 
-void recip64f( const double* src1, size_t step1, const double* src2, size_t step2,
+void recip64f( const double*, size_t, const double* src2, size_t step2,
                    double* dst, size_t step, int width, int height, void* scale)
 {
-    CALL_HAL(recip64f, cv_hal_recip64f, src1, step1, src2, step2, dst, step, width, height, *(const double*)scale)
-    recip_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
+    CALL_HAL(recip64f, cv_hal_recip64f, src2, step2, dst, step, width, height, *(const double*)scale)
+    recip_f(src2, step2, dst, step, width, height, *(const double*)scale);
 }
 
 //=======================================
diff --git a/modules/core/src/arithm_core.hpp b/modules/core/src/arithm_core.hpp
index 4790586eb..b92d47a81 100644
--- a/modules/core/src/arithm_core.hpp
+++ b/modules/core/src/arithm_core.hpp
@@ -528,7 +528,7 @@ div_f( const T* src1, size_t step1, const T* src2, size_t step2,
 }
 
 template<typename T> static void
-recip_i( const T*, size_t, const T* src2, size_t step2,
+recip_i( const T* src2, size_t step2,
          T* dst, size_t step, int width, int height, double scale )
 {
     step2 /= sizeof(src2[0]);
@@ -549,7 +549,7 @@ recip_i( const T*, size_t, const T* src2, size_t step2,
 }
 
 template<typename T> static void
-recip_f( const T*, size_t, const T* src2, size_t step2,
+recip_f( const T* src2, size_t step2,
          T* dst, size_t step, int width, int height, double scale )
 {
     T scale_f = (T)scale;
diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp
index 65866f8bf..69345ca4a 100644
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@@ -42,51 +42,119 @@
 //
 //M*/
 
-#ifndef __OPENCV_CORE_HAL_REPLACEMENT_HPP__
-#define __OPENCV_CORE_HAL_REPLACEMENT_HPP__
+#ifndef OPENCV_CORE_HAL_REPLACEMENT_HPP
+#define OPENCV_CORE_HAL_REPLACEMENT_HPP
 
 #include "opencv2/core/hal/interface.h"
 
-inline int hal_ni_add8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_add8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_add16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_add16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_add32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_add32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_add64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_sub8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_sub8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_sub16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_sub16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_sub32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_sub32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_sub64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_max8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_max8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_max16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_max16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_max32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_max32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_max64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_min8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_min8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_min16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_min16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_min32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_min32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_min64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_absdiff8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_absdiff8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_absdiff16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_absdiff16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_absdiff32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_absdiff32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_absdiff64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_and8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_or8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_xor8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_not8u(const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+#if defined __GNUC__
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wunused-parameter"
+#elif defined _MSC_VER
+#  pragma warning( push )
+#  pragma warning( disable: 4100 )
+#endif
 
+//! @addtogroup core_hal_interface
+//! @note Define your functions to override default implementations:
+//! @code
+//! #undef cv_hal_add8u
+//! #define cv_hal_add8u my_add8u
+//! @endcode
+//! @{
+
+/**
+Add: _dst[i] = src1[i] + src2[i]_ @n
+Sub: _dst[i] = src1[i] - src2[i]_
+@param src1_data,src1_step first source image data and step
+@param src2_data,src2_step second source image data and step
+@param dst_data,dst_step destination image data and step
+@param width,height dimensions of the images
+*/
+//! @addtogroup core_hal_interface_addsub Element-wise add and subtract
+//! @{
+inline int hal_ni_add8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_add8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_add16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_add16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_add32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_add32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_add64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+inline int hal_ni_sub8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_sub8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_sub16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_sub16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_sub32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_sub32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_sub64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+/**
+Minimum: _dst[i] = min(src1[i], src2[i])_ @n
+Maximum: _dst[i] = max(src1[i], src2[i])_
+@param src1_data,src1_step first source image data and step
+@param src2_data,src2_step second source image data and step
+@param dst_data,dst_step destination image data and step
+@param width,height dimensions of the images
+*/
+//! @addtogroup core_hal_interface_minmax Element-wise minimum or maximum
+//! @{
+inline int hal_ni_max8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_max8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_max16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_max16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_max32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_max32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_max64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+inline int hal_ni_min8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_min8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_min16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_min16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_min32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_min32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_min64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+/**
+Absolute difference: _dst[i] = | src1[i] - src2[i] |_
+@param src1_data,src1_step first source image data and step
+@param src2_data,src2_step second source image data and step
+@param dst_data,dst_step destination image data and step
+@param width,height dimensions of the images
+
+*/
+//! @addtogroup core_hal_interface_absdiff Element-wise absolute difference
+//! @{
+inline int hal_ni_absdiff8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_absdiff8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_absdiff16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_absdiff16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_absdiff32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_absdiff32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_absdiff64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+/**
+Bitwise AND: _dst[i] = src1[i] & src2[i]_ @n
+Bitwise OR: _dst[i] = src1[i] | src2[i]_ @n
+Bitwise XOR: _dst[i] = src1[i] ^ src2[i]_ @n
+Bitwise NOT: _dst[i] = ~src[i]_
+@param src1_data,src1_step first source image data and step
+@param src2_data,src2_step second source image data and step
+@param dst_data,dst_step destination image data and step
+@param width,height dimensions of the images
+ */
+//! @addtogroup core_hal_interface_logical Bitwise logical operations
+//! @{
+inline int hal_ni_and8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_or8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_xor8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_not8u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+//! @cond IGNORED
 #define cv_hal_add8u hal_ni_add8u
 #define cv_hal_add8s hal_ni_add8s
 #define cv_hal_add16u hal_ni_add16u
@@ -126,15 +194,28 @@ inline int hal_ni_not8u(const uchar*, size_t, uchar*, size_t, int, int) { return
 #define cv_hal_or8u hal_ni_or8u
 #define cv_hal_xor8u hal_ni_xor8u
 #define cv_hal_not8u hal_ni_not8u
+//! @endcond
 
-inline int hal_ni_cmp8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_cmp8s(const schar*, size_t, const schar*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_cmp16u(const ushort*, size_t, const ushort*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_cmp16s(const short*, size_t, const short*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_cmp32s(const int*, size_t, const int*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_cmp32f(const float*, size_t, const float*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_cmp64f(const double*, size_t, const double*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+Compare: _dst[i] = src1[i] op src2[i]_
+@param src1_data,src1_step first source image data and step
+@param src2_data,src2_step second source image data and step
+@param dst_data,dst_step destination image data and step
+@param width,height dimensions of the images
+@param operation one of (CV_HAL_CMP_EQ, CV_HAL_CMP_GT, ...)
+*/
+//! @addtogroup core_hal_interface_compare Element-wise compare
+//! @{
+inline int hal_ni_cmp8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_cmp8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_cmp16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_cmp16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_cmp32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_cmp32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_cmp64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
 
+//! @cond IGNORED
 #define cv_hal_cmp8u hal_ni_cmp8u
 #define cv_hal_cmp8s hal_ni_cmp8s
 #define cv_hal_cmp16u hal_ni_cmp16u
@@ -142,29 +223,65 @@ inline int hal_ni_cmp64f(const double*, size_t, const double*, size_t, uchar*, s
 #define cv_hal_cmp32s hal_ni_cmp32s
 #define cv_hal_cmp32f hal_ni_cmp32f
 #define cv_hal_cmp64f hal_ni_cmp64f
+//! @endcond
 
-inline int hal_ni_mul8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_mul8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_mul16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_mul16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_mul32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_mul32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_mul64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_div8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_div8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_div16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_div16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_div32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_div32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_div64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_recip8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_recip8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_recip16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_recip16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_recip32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_recip32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_recip64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+Multiply: _dst[i] = scale * src1[i] * src2[i]_
+@param src1_data,src1_step first source image data and step
+@param src2_data,src2_step second source image data and step
+@param dst_data,dst_step destination image data and step
+@param width,height dimensions of the images
+@param scale additional multiplier
+*/
+//! @addtogroup core_hal_interface_multiply Element-wise multiply
+//! @{
+inline int hal_ni_mul8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
 
+/**
+Divide: _dst[i] = scale * src1[i] / src2[i]_
+@param src1_data,src1_step first source image data and step
+@param src2_data,src2_step second source image data and step
+@param dst_data,dst_step destination image data and step
+@param width,height dimensions of the images
+@param scale additional multiplier
+*/
+//! @addtogroup core_hal_interface_divide Element-wise divide
+//! @{
+inline int hal_ni_div8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_div8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_div16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_div16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_div32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_div32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_div64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+/**
+Computes reciprocal: _dst[i] = scale / src[i]_
+@param src_data,src_step source image data and step
+@param dst_data,dst_step destination image data and step
+@param width,height dimensions of the images
+@param scale additional multiplier
+ */
+//! @addtogroup core_hal_interface_reciprocial Element-wise reciprocal
+//! @{
+inline int hal_ni_recip8u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_recip8s(const schar *src_data, size_t src_step, schar *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_recip16u(const ushort *src_data, size_t src_step, ushort *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_recip16s(const short *src_data, size_t src_step, short *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_recip32s(const int *src_data, size_t src_step, int *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_recip32f(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_recip64f(const double *src_data, size_t src_step, double *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+//! @cond IGNORED
 #define cv_hal_mul8u hal_ni_mul8u
 #define cv_hal_mul8s hal_ni_mul8s
 #define cv_hal_mul16u hal_ni_mul16u
@@ -186,15 +303,28 @@ inline int hal_ni_recip64f(const double*, size_t, const double*, size_t, double*
 #define cv_hal_recip32s hal_ni_recip32s
 #define cv_hal_recip32f hal_ni_recip32f
 #define cv_hal_recip64f hal_ni_recip64f
+//! @endcond
 
-inline int hal_ni_addWeighted8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_addWeighted8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_addWeighted16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_addWeighted16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_addWeighted32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_addWeighted32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_addWeighted64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+Computes weighted sum of two arrays using formula: _dst[i] = a * src1[i] + b * src2[i] + c_
+@param src1_data,src1_step first source image data and step
+@param src2_data,src2_step second source image data and step
+@param dst_data,dst_step destination image data and step
+@param width,height dimensions of the images
+@param scalars numbers _a_, _b_, and _c_
+ */
+//! @addtogroup core_hal_interface_addWeighted Element-wise weighted sum
+//! @{
+inline int hal_ni_addWeighted8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, const double scalars[3]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_addWeighted8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar *dst_data, size_t dst_step, int width, int height, const double scalars[3]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_addWeighted16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height, const double scalars[3]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_addWeighted16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height, const double scalars[3]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_addWeighted32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, const double scalars[3]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_addWeighted32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, const double scalars[3]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_addWeighted64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, const double scalars[3]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
 
+//! @cond IGNORED
 #define cv_hal_addWeighted8u hal_ni_addWeighted8u
 #define cv_hal_addWeighted8s hal_ni_addWeighted8s
 #define cv_hal_addWeighted16u hal_ni_addWeighted16u
@@ -202,26 +332,57 @@ inline int hal_ni_addWeighted64f(const double*, size_t, const double*, size_t, d
 #define cv_hal_addWeighted32s hal_ni_addWeighted32s
 #define cv_hal_addWeighted32f hal_ni_addWeighted32f
 #define cv_hal_addWeighted64f hal_ni_addWeighted64f
+//! @endcond
 
-inline int hal_ni_split8u(const uchar*, uchar**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_split16u(const ushort*, ushort**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_split32s(const int*, int**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_split64s(const int64*, int64**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param src_data array of interleaved values (__len__ x __cn__ items) [ B, G, R, B, G, R, ...]
+@param dst_data array of pointers to destination arrays (__cn__ items x __len__ items) [ [B, B, ...], [G, G, ...], [R, R, ...] ]
+@param len number of elements
+@param cn number of channels
+ */
+//! @addtogroup core_hal_interface_split Channel split
+//! @{
+inline int hal_ni_split8u(const uchar *src_data, uchar **dst_data, int len, int cn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_split16u(const ushort *src_data, ushort **dst_data, int len, int cn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_split32s(const int *src_data, int **dst_data, int len, int cn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_split64s(const int64 *src_data, int64 **dst_data, int len, int cn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
 
+//! @cond IGNORED
 #define cv_hal_split8u hal_ni_split8u
 #define cv_hal_split16u hal_ni_split16u
 #define cv_hal_split32s hal_ni_split32s
 #define cv_hal_split64s hal_ni_split64s
+//! @endcond
 
-inline int hal_ni_merge8u(const uchar**, uchar*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_merge16u(const ushort**, ushort*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_merge32s(const int**, int*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_merge64s(const int64**, int64*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param src_data array of pointers to source arrays (__cn__ items x __len__ items) [ [B, B, ...], [G, G, ...], [R, R, ...] ]
+@param dst_data destination array of interleaved values (__len__ x __cn__ items) [ B, G, R, B, G, R, ...]
+@param len number of elements
+@param cn number of channels
+ */
+//! @addtogroup core_hal_interface_merge Channel merge
+//! @{
+inline int hal_ni_merge8u(const uchar **src_data, uchar *dst_data, int len, int cn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_merge16u(const ushort **src_data, ushort *dst_data, int len, int cn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_merge32s(const int **src_data, int *dst_data, int len, int cn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_merge64s(const int64 **src_data, int64 *dst_data, int len, int cn) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
 
+//! @cond IGNORED
 #define cv_hal_merge8u hal_ni_merge8u
 #define cv_hal_merge16u hal_ni_merge16u
 #define cv_hal_merge32s hal_ni_merge32s
 #define cv_hal_merge64s hal_ni_merge64s
+//! @endcond
+
+//! @}
+
+#if defined __GNUC__
+#  pragma GCC diagnostic pop
+#elif defined _MSC_VER
+#  pragma warning( pop )
+#endif
 
 #include "custom_hal.hpp"
 
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index e1f9348b1..6d9ffb490 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -213,6 +213,11 @@ location of points on the plane, building special graphs (such as NNG,RNG), and
     @defgroup imgproc_feature Feature Detection
     @defgroup imgproc_object Object Detection
     @defgroup imgproc_c C API
+    @defgroup imgproc_hal Hardware Acceleration Layer
+    @{
+        @defgroup imgproc_hal_functions Functions
+        @defgroup imgproc_hal_interface Interface
+    @}
   @}
 */
 
diff --git a/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp b/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp
index eca34e7e7..6ed492bcb 100644
--- a/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp
+++ b/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp
@@ -7,7 +7,7 @@
 
 namespace cv { namespace hal {
 
-//! @addtogroup core_hal_functions
+//! @addtogroup imgproc_hal_functions
 //! @{
 
 struct CV_EXPORTS Filter2D
@@ -45,9 +45,9 @@ struct CV_EXPORTS SepFilter2D
 };
 
 
-struct  CV_EXPORTS MorphContext
+struct  CV_EXPORTS Morph
 {
-    static Ptr<MorphContext> create(int op, int src_type, int dst_type, int max_width, int max_height,
+    static Ptr<Morph> create(int op, int src_type, int dst_type, int max_width, int max_height,
                                     int kernel_type, uchar * kernel_data, size_t kernel_step,
                                     int kernel_width, int kernel_height,
                                     int anchor_x, int anchor_y,
@@ -56,10 +56,25 @@ struct  CV_EXPORTS MorphContext
     virtual void apply(uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height,
                        int roi_width, int roi_height, int roi_x, int roi_y,
                        int roi_width2, int roi_height2, int roi_x2, int roi_y2) = 0;
-    virtual ~MorphContext() {}
+    virtual ~Morph() {}
 };
 
 
+CV_EXPORTS void resize(int src_type,
+                       const uchar * src_data, size_t src_step, int src_width, int src_height,
+                       uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
+                       double inv_scale_x, double inv_scale_y, int interpolation);
+
+CV_EXPORTS void warpAffine(int src_type,
+                           const uchar * src_data, size_t src_step, int src_width, int src_height,
+                           uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
+                           const double M[6], int interpolation, int borderType, const double borderValue[4]);
+
+CV_EXPORTS void warpPerspectve(int src_type,
+                               const uchar * src_data, size_t src_step, int src_width, int src_height,
+                               uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
+                               const double M[9], int interpolation, int borderType, const double borderValue[4]);
+
 //! @}
 
 }}
diff --git a/modules/imgproc/include/opencv2/imgproc/hal/interface.h b/modules/imgproc/include/opencv2/imgproc/hal/interface.h
new file mode 100644
index 000000000..9d2a3e5d5
--- /dev/null
+++ b/modules/imgproc/include/opencv2/imgproc/hal/interface.h
@@ -0,0 +1,26 @@
+#ifndef OPENCV_IMGPROC_HAL_INTERFACE_H
+#define OPENCV_IMGPROC_HAL_INTERFACE_H
+
+//! @addtogroup imgproc_hal_interface
+//! @{
+
+//! @name Interpolation modes
+//! @sa cv::InterpolationFlags
+//! @{
+#define CV_HAL_INTER_NEAREST 0
+#define CV_HAL_INTER_LINEAR 1
+#define CV_HAL_INTER_CUBIC 2
+#define CV_HAL_INTER_AREA 3
+#define CV_HAL_INTER_LANCZOS4 4
+//! @}
+
+//! @name Morphology operations
+//! @sa cv::MorphTypes
+//! @{
+#define CV_HAL_MORPH_ERODE 0
+#define CV_HAL_MORPH_DILATE 1
+//! @}
+
+//! @}
+
+#endif
diff --git a/modules/imgproc/src/hal_replacement.hpp b/modules/imgproc/src/hal_replacement.hpp
index e043c4027..2b681f6cf 100644
--- a/modules/imgproc/src/hal_replacement.hpp
+++ b/modules/imgproc/src/hal_replacement.hpp
@@ -1,34 +1,312 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
 #ifndef OPENCV_IMGPROC_HAL_REPLACEMENT_HPP
 #define OPENCV_IMGPROC_HAL_REPLACEMENT_HPP
 
 #include "opencv2/core/hal/interface.h"
 
+#if defined __GNUC__
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wunused-parameter"
+#elif defined _MSC_VER
+#  pragma warning( push )
+#  pragma warning( disable: 4100 )
+#endif
+
+//! @addtogroup imgproc_hal_interface
+//! @note Define your functions to override default implementations:
+//! @code
+//! #undef cv_hal_filterInit
+//! #define cv_hal_filterInit my_filterInit
+//! @endcode
+//! @{
+
+/**
+@brief Dummy structure storing filtering context
+
+Users can convert this pointer to any type they want. Initialisation and destruction should be made in Init and Free function implementations correspondingly.
+Example:
+@code{.cpp}
+int my_hal_filterInit(cvhalFilter2D **context, ...) {
+    *context = reinterpret_cast<cvhalFilter2D*>(new MyFilterData());
+    //... init, then return a status code
+}
+
+int my_hal_filterFree(cvhalFilter2D *context) {
+    MyFilterData *c = reinterpret_cast<MyFilterData*>(context);
+    delete c;
+}
+@endcode
+ */
 struct cvhalFilter2D {};
 
-inline int hal_ni_filterInit(cvhalFilter2D **, uchar *, size_t, int, int, int, int, int, int, int, int, double, int, int, bool, bool) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_filter(cvhalFilter2D *, uchar *, size_t, uchar *, size_t, int, int, int, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_filterFree(cvhalFilter2D *) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_filterInit
+   @param context double pointer to user-defined context
+   @param kernel_data pointer to kernel data
+   @param kernel_step kernel step
+   @param kernel_type kernel type (CV_8U, ...)
+   @param kernel_width kernel width
+   @param kernel_height kernel height
+   @param max_width max possible image width, can be used to allocate working buffers
+   @param max_height max possible image height
+   @param src_type source image type
+   @param dst_type destination image type
+   @param borderType border processing mode (CV_HAL_BORDER_REFLECT, ...)
+   @param delta added to pixel values
+   @param anchor_x relative X position of center point within the kernel
+   @param anchor_y relative Y position of center point within the kernel
+   @param allowSubmatrix indicates whether the submatrices will be allowed as source image
+   @param allowInplace indicates whether the inplace operation will be possible
+   @sa cv::filter2D, cv::hal::Filter2D
+ */
+inline int hal_ni_filterInit(cvhalFilter2D **context, uchar *kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, int max_width, int max_height, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool allowSubmatrix, bool allowInplace) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_filter
+   @param context pointer to user-defined context
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width images width
+   @param height images height
+   @param full_width full width of source image (outside the ROI)
+   @param full_height full height of source image (outside the ROI)
+   @param offset_x source image ROI offset X
+   @param offset_y source image ROI offset Y
+   @sa cv::filter2D, cv::hal::Filter2D
+ */
+inline int hal_ni_filter(cvhalFilter2D *context, uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_filterFree
+   @param context pointer to user-defined context
+   @sa cv::filter2D, cv::hal::Filter2D
+ */
+inline int hal_ni_filterFree(cvhalFilter2D *context) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
+//! @cond IGNORED
 #define cv_hal_filterInit hal_ni_filterInit
 #define cv_hal_filter hal_ni_filter
 #define cv_hal_filterFree hal_ni_filterFree
+//! @endcond
 
-inline int hal_ni_sepFilterInit(cvhalFilter2D **, int, int, int, uchar *, size_t, int, int, uchar *, size_t, int, int, int, int, double, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_sepFilter(cvhalFilter2D *, uchar *, size_t, uchar*, size_t, int, int, int, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_sepFilterFree(cvhalFilter2D *) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_sepFilterInit
+   @param context double pointer to user-defined context
+   @param src_type source image type
+   @param dst_type destination image type
+   @param kernel_type kernels type
+   @param kernelx_data pointer to x-kernel data
+   @param kernelx_step x-kernel step
+   @param kernelx_width x-kernel width
+   @param kernelx_height x-kernel height
+   @param kernely_data pointer to y-kernel data
+   @param kernely_step y-kernel step
+   @param kernely_width y-kernel width
+   @param kernely_height y-kernel height
+   @param anchor_x relative X position of center point within the kernel
+   @param anchor_y relative Y position of center point within the kernel
+   @param delta added to pixel values
+   @param borderType border processing mode (CV_HAL_BORDER_REFLECT, ...)
+   @sa cv::sepFilter2D, cv::hal::SepFilter2D
+ */
+inline int hal_ni_sepFilterInit(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, uchar *kernelx_data, size_t kernelx_step, int kernelx_width, int kernelx_height, uchar *kernely_data, size_t kernely_step, int kernely_width, int kernely_height, int anchor_x, int anchor_y, double delta, int borderType) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_sepFilter
+   @param context pointer to user-defined context
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width images width
+   @param height images height
+   @param full_width full width of source image (outside the ROI)
+   @param full_height full height of source image (outside the ROI)
+   @param offset_x source image ROI offset X
+   @param offset_y source image ROI offset Y
+   @sa cv::sepFilter2D, cv::hal::SepFilter2D
+ */
+inline int hal_ni_sepFilter(cvhalFilter2D *context, uchar *src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_sepFilterFree
+   @param context pointer to user-defined context
+   @sa cv::sepFilter2D, cv::hal::SepFilter2D
+ */
+inline int hal_ni_sepFilterFree(cvhalFilter2D *context) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
+//! @cond IGNORED
 #define cv_hal_sepFilterInit hal_ni_sepFilterInit
 #define cv_hal_sepFilter hal_ni_sepFilter
 #define cv_hal_sepFilterFree hal_ni_sepFilterFree
+//! @endcond
 
-inline int hal_ni_morphInit(cvhalFilter2D **, int, int, int, int, int, int, uchar *, size_t, int, int, int, int, int, const double[4], int, bool, bool) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_morph(cvhalFilter2D *, uchar *, size_t, uchar *, size_t, int, int, int, int, int, int, int, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_morphFree(cvhalFilter2D *) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_morphInit
+   @param context double pointer to user-defined context
+   @param operation morphology operation CV_HAL_MORPH_ERODE or CV_HAL_MORPH_DILATE
+   @param src_type source image type
+   @param dst_type destination image type
+   @param max_width max possible image width, can be used to allocate working buffers
+   @param max_height max possible image height
+   @param kernel_type kernel type (CV_8U, ...)
+   @param kernel_data pointer to kernel data
+   @param kernel_step kernel step
+   @param kernel_width kernel width
+   @param kernel_height kernel height
+   @param anchor_x relative X position of center point within the kernel
+   @param anchor_y relative Y position of center point within the kernel
+   @param borderType border processing mode (CV_HAL_BORDER_REFLECT, ...)
+   @param borderValue values to use for CV_HAL_BORDER_CONSTANT mode
+   @param iterations number of iterations
+   @param allowSubmatrix indicates whether the submatrices will be allowed as source image
+   @param allowInplace indicates whether the inplace operation will be possible
+   @sa cv::erode, cv::dilate, cv::morphologyEx, cv::hal::Morph
+ */
+inline int hal_ni_morphInit(cvhalFilter2D **context, int operation, int src_type, int dst_type, int max_width, int max_height, int kernel_type, uchar *kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, int borderType, const double borderValue[4], int iterations, bool allowSubmatrix, bool allowInplace) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_morph
+   @param context pointer to user-defined context
+   @param src_data source image data
+   @param src_step source image step
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param width width of the source and destination images
+   @param height height of the source and destination images
+   @param src_full_width full width of source image (outside the ROI)
+   @param src_full_height full height of source image (outside the ROI)
+   @param src_roi_x source image ROI X offset
+   @param src_roi_y source image ROI Y offset
+   @param dst_full_width full width of destination image
+   @param dst_full_height full height of destination image
+   @param dst_roi_x destination image ROI X offset
+   @param dst_roi_y destination image ROI Y offset
+   @sa cv::erode, cv::dilate, cv::morphologyEx, cv::hal::Morph
+ */
+inline int hal_ni_morph(cvhalFilter2D *context, uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, int src_full_width, int src_full_height, int src_roi_x, int src_roi_y, int dst_full_width, int dst_full_height, int dst_roi_x, int dst_roi_y) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_morphFree
+   @param context pointer to user-defined context
+   @sa cv::erode, cv::dilate, cv::morphologyEx, cv::hal::Morph
+ */
+inline int hal_ni_morphFree(cvhalFilter2D *context) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
+//! @cond IGNORED
 #define cv_hal_morphInit hal_ni_morphInit
 #define cv_hal_morph hal_ni_morph
 #define cv_hal_morphFree hal_ni_morphFree
+//! @endcond
+
+/**
+   @brief hal_resize
+   @param src_type source and destination image type
+   @param src_data source image data
+   @param src_step source image step
+   @param src_width source image width
+   @param src_height source image height
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param dst_width destination image width
+   @param dst_height destination image height
+   @param inv_scale_x inverse scale X coefficient
+   @param inv_scale_y inverse scale Y coefficient
+   @param interpolation interpolation mode (CV_HAL_INTER_NEAREST, ...)
+   @sa cv::resize, cv::hal::resize
+ */
+inline int hal_ni_resize(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y, int interpolation) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_warpAffine
+   @param src_type source and destination image type
+   @param src_data source image data
+   @param src_step source image step
+   @param src_width source image width
+   @param src_height source image height
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param dst_width destination image width
+   @param dst_height destination image height
+   @param M 2x3 matrix with transform coefficients
+   @param interpolation interpolation mode (CV_HAL_INTER_NEAREST, ...)
+   @param borderType border processing mode (CV_HAL_BORDER_REFLECT, ...)
+   @param borderValue values to use for CV_HAL_BORDER_CONSTANT mode
+   @sa cv::warpAffine, cv::hal::warpAffine
+ */
+inline int hal_ni_warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_warpPerspective
+   @param src_type source and destination image type
+   @param src_data source image data
+   @param src_step source image step
+   @param src_width source image width
+   @param src_height source image height
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param dst_width destination image width
+   @param dst_height destination image height
+   @param M 3x3 matrix with transform coefficients
+   @param interpolation interpolation mode (CV_HAL_INTER_NEAREST, ...)
+   @param borderType border processing mode (CV_HAL_BORDER_REFLECT, ...)
+   @param borderValue values to use for CV_HAL_BORDER_CONSTANT mode
+   @sa cv::warpPerspective, cv::hal::warpPerspective
+ */
+inline int hal_ni_warpPerspectve(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_resize hal_ni_resize
+#define cv_hal_warpAffine hal_ni_warpAffine
+#define cv_hal_warpPerspective hal_ni_warpPerspectve
+//! @endcond
+
+//! @}
+
+#if defined __GNUC__
+#  pragma GCC diagnostic pop
+#elif defined _MSC_VER
+#  pragma warning( pop )
+#endif
+
 
 #include "custom_hal.hpp"
 
-#endif // OPENCV_IMGPROC_HAL_REPLACEMENT_HPP
+#endif
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index a7f8eee44..d346965ed 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -49,6 +49,7 @@
 
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
+#include "hal_replacement.hpp"
 
 using namespace cv;
 
@@ -3091,8 +3092,8 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
 #endif
 
 #if IPP_VERSION_X100 >= 710
-static bool ipp_resize_mt(    Mat src, Mat dst,
-                        double inv_scale_x, double inv_scale_y, int interpolation)
+static bool ipp_resize_mt(Mat & src, Mat & dst,
+                          double inv_scale_x, double inv_scale_y, int interpolation)
 {
     int mode = -1;
     if (interpolation == INTER_LINEAR && src.rows >= 2 && src.cols >= 2)
@@ -3113,15 +3114,24 @@ static bool ipp_resize_mt(    Mat src, Mat dst,
 }
 #endif
 
-}
+//==================================================================================================
 
+namespace hal {
 
-
-//////////////////////////////////////////////////////////////////////////////////////////
-
-void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
-                 double inv_scale_x, double inv_scale_y, int interpolation )
+void resize(int src_type,
+            const uchar * src_data, size_t src_step, int src_width, int src_height,
+            uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
+            double inv_scale_x, double inv_scale_y, int interpolation)
 {
+    CV_Assert((dst_width * dst_height > 0) || (inv_scale_x > 0 && inv_scale_y > 0));
+    if (inv_scale_x < DBL_EPSILON || inv_scale_y < DBL_EPSILON)
+    {
+        inv_scale_x = static_cast<double>(dst_width) / src_width;
+        inv_scale_y = static_cast<double>(dst_height) / src_height;
+    }
+
+    CALL_HAL(resize, cv_hal_resize, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, inv_scale_x, inv_scale_y, interpolation);
+
     static ResizeFunc linear_tab[] =
     {
         resizeGeneric_<
@@ -3226,24 +3236,7 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
         resizeArea_<double, double>, 0
     };
 
-    Size ssize = _src.size();
-
-    CV_Assert( ssize.area() > 0 );
-    CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) );
-    if( dsize.area() == 0 )
-    {
-        dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
-                     saturate_cast<int>(ssize.height*inv_scale_y));
-        CV_Assert( dsize.area() > 0 );
-    }
-    else
-    {
-        inv_scale_x = (double)dsize.width/ssize.width;
-        inv_scale_y = (double)dsize.height/ssize.height;
-    }
-
-
-    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    int depth = CV_MAT_DEPTH(src_type), cn = CV_MAT_CN(src_type);
     double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;
 
     int iscale_x = saturate_cast<int>(scale_x);
@@ -3252,42 +3245,30 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
     bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
             std::abs(scale_y - iscale_y) < DBL_EPSILON;
 
+    Size dsize = Size(saturate_cast<int>(src_width*inv_scale_x),
+                      saturate_cast<int>(src_height*inv_scale_y));
+    CV_Assert( dsize.area() > 0 );
 
-    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10,
-               ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
-
-    Mat src = _src.getMat();
-    _dst.create(dsize, src.type());
-    Mat dst = _dst.getMat();
-
-    if (dsize == ssize) {
-      // Source and destination are of same size. Use simple copy.
-      src.copyTo(dst);
-      return;
-    }
-
-#ifdef HAVE_TEGRA_OPTIMIZATION
-    if (tegra::useTegra() && tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))
-        return;
-#endif
+    Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
+    Mat dst(dsize, src_type, dst_data, dst_step);
 
 #ifdef HAVE_IPP
     int mode = -1;
-    if (interpolation == INTER_LINEAR && _src.rows() >= 2 && _src.cols() >= 2)
+    if (interpolation == INTER_LINEAR && src_height >= 2 && src_width >= 2)
         mode = INTER_LINEAR;
-    else if (interpolation == INTER_CUBIC && _src.rows() >= 4 && _src.cols() >= 4)
+    else if (interpolation == INTER_CUBIC && src_height >= 4 && src_width >= 4)
         mode = INTER_CUBIC;
 
     const double IPP_RESIZE_EPS = 1e-10;
-    double ex = fabs((double)dsize.width / _src.cols()  - inv_scale_x) / inv_scale_x;
-    double ey = fabs((double)dsize.height / _src.rows() - inv_scale_y) / inv_scale_y;
+    double ex = fabs((double)dsize.width / src_width  - inv_scale_x) / inv_scale_x;
+    double ey = fabs((double)dsize.height / src_height - inv_scale_y) / inv_scale_y;
 #endif
     CV_IPP_RUN(IPP_VERSION_X100 >= 710 && ((ex < IPP_RESIZE_EPS && ey < IPP_RESIZE_EPS && depth != CV_64F) || (ex == 0 && ey == 0 && depth == CV_64F)) &&
         (interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
         !(interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 && depth == CV_8U) &&
         mode >= 0 && (cn == 1 || cn == 3 || cn == 4) && (depth == CV_16U || depth == CV_16S || depth == CV_32F ||
-        (depth == CV_64F && mode == INTER_LINEAR)), ipp_resize_mt(src, dst, inv_scale_x, inv_scale_y, interpolation))
-
+        (depth == CV_64F && mode == INTER_LINEAR)),
+        ipp_resize_mt(src, dst, inv_scale_x, inv_scale_y, interpolation))
 
     if( interpolation == INTER_NEAREST )
     {
@@ -3311,7 +3292,7 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
             if( is_area_fast )
             {
                 int area = iscale_x*iscale_y;
-                size_t srcstep = src.step / src.elemSize1();
+                size_t srcstep = src_step / src.elemSize1();
                 AutoBuffer<int> _ofs(area + dsize.width*cn);
                 int* ofs = _ofs;
                 int* xofs = ofs + area;
@@ -3337,11 +3318,11 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
             ResizeAreaFunc func = area_tab[depth];
             CV_Assert( func != 0 && cn <= 4 );
 
-            AutoBuffer<DecimateAlpha> _xytab((ssize.width + ssize.height)*2);
-            DecimateAlpha* xtab = _xytab, *ytab = xtab + ssize.width*2;
+            AutoBuffer<DecimateAlpha> _xytab((src_width + src_height)*2);
+            DecimateAlpha* xtab = _xytab, *ytab = xtab + src_width*2;
 
-            int xtab_size = computeResizeAreaTab(ssize.width, dsize.width, cn, scale_x, xtab);
-            int ytab_size = computeResizeAreaTab(ssize.height, dsize.height, 1, scale_y, ytab);
+            int xtab_size = computeResizeAreaTab(src_width, dsize.width, cn, scale_x, xtab);
+            int ytab_size = computeResizeAreaTab(src_height, dsize.height, 1, scale_y, ytab);
 
             AutoBuffer<int> _tabofs(dsize.height + 1);
             int* tabofs = _tabofs;
@@ -3409,11 +3390,11 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
                 fx = 0, sx = 0;
         }
 
-        if( sx + ksize2 >= ssize.width )
+        if( sx + ksize2 >= src_width )
         {
             xmax = std::min( xmax, dx );
-            if( sx >= ssize.width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
-                fx = 0, sx = ssize.width-1;
+            if( sx >= src_width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
+                fx = 0, sx = src_width-1;
         }
 
         for( k = 0, sx *= cn; k < cn; k++ )
@@ -3486,6 +3467,46 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
           fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize );
 }
 
+} // cv::hal::
+} // cv::
+
+//==================================================================================================
+// Public resize entry point: settles dsize and the scale factors, offers the work to OpenCL, then calls hal::resize.
+void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
+                 double inv_scale_x, double inv_scale_y, int interpolation )
+{
+    Size ssize = _src.size();
+    // The source must be non-empty, and either dsize or both scale factors must be positive.
+    CV_Assert( ssize.area() > 0 );
+    CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) );
+    if( dsize.area() == 0 ) // no explicit size: derive it from the scale factors
+    {
+        dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
+                     saturate_cast<int>(ssize.height*inv_scale_y));
+        CV_Assert( dsize.area() > 0 );
+    }
+    else // explicit size given: the passed-in scale factors are overwritten
+    {
+        inv_scale_x = (double)dsize.width/ssize.width;
+        inv_scale_y = (double)dsize.height/ssize.height;
+    }
+    // ocl_resize is offered sources with dims <= 2, more than 10 rows and columns, and a UMat destination.
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10,
+               ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
+
+    Mat src = _src.getMat();
+    _dst.create(dsize, src.type());
+    Mat dst = _dst.getMat();
+
+    if (dsize == ssize) {
+      // Source and destination are of same size. Use simple copy.
+      src.copyTo(dst);
+      return;
+    }
+    // All remaining cases go to the pointer-based implementation with the resolved geometry.
+    hal::resize(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows, inv_scale_x, inv_scale_y, interpolation);
+}
+
 
 /****************************************************************************************\
 *                       General warping (affine, perspective, remap)                     *
@@ -5232,7 +5253,7 @@ class WarpAffineInvoker :
 {
 public:
     WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
-                      const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) :
+                      const Scalar &_borderValue, int *_adelta, int *_bdelta, const double *_M) :
         ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
         borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
         M(_M)
@@ -5410,7 +5431,7 @@ private:
     int interpolation, borderType;
     Scalar borderValue;
     int *adelta, *bdelta;
-    double *M;
+    const double *M;
 };
 
 
@@ -5569,8 +5590,40 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
 
 #endif
 
+namespace hal {
+// Pointer-based warpAffine: passes the call to cv_hal_warpAffine through CALL_HAL, then runs WarpAffineInvoker.
+void warpAffine(int src_type,
+                const uchar * src_data, size_t src_step, int src_width, int src_height,
+                uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
+                const double M[6], int interpolation, int borderType, const double borderValue[4])
+{
+    CALL_HAL(warpAffine, cv_hal_warpAffine, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue); // NOTE(review): macro defined elsewhere; presumably returns when the replacement succeeds - confirm
+    // Mat headers over the caller's buffers; dst is given the same type as src.
+    Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
+    Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
+    // One buffer holds two per-column tables: adelta in the first dst.cols ints, bdelta in the next dst.cols.
+    int x;
+    AutoBuffer<int> _abdelta(dst.cols*2);
+    int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
+    const int AB_BITS = MAX(10, (int)INTER_BITS);
+    const int AB_SCALE = 1 << AB_BITS;
+
+    for( x = 0; x < dst.cols; x++ )
+    {
+        adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE); // M[0]*x scaled by AB_SCALE and saturated to int
+        bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE); // M[3]*x scaled by AB_SCALE and saturated to int
+    }
+    // The invoker is run over all destination rows through parallel_for_.
+    Range range(0, dst.rows);
+    WarpAffineInvoker invoker(src, dst, interpolation, borderType,
+                              Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]),
+                              adelta, bdelta, M);
+    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
 }
 
+} // hal::
+} // cv::
+
 
 void cv::warpAffine( InputArray _src, OutputArray _dst,
                      InputArray _M0, Size dsize,
@@ -5596,11 +5649,6 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
     M0.convertTo(matM, matM.type());
 
-#ifdef HAVE_TEGRA_OPTIMIZATION
-    if( tegra::useTegra() && tegra::warpAffine(src, dst, M, flags, borderType, borderValue) )
-        return;
-#endif
-
     if( !(flags & WARP_INVERSE_MAP) )
     {
         double D = M[0]*M[4] - M[1]*M[3];
@@ -5613,12 +5661,6 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
         M[2] = b1; M[5] = b2;
     }
 
-    int x;
-    AutoBuffer<int> _abdelta(dst.cols*2);
-    int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
-    const int AB_BITS = MAX(10, (int)INTER_BITS);
-    const int AB_SCALE = 1 << AB_BITS;
-
 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
     CV_IPP_CHECK()
     {
@@ -5683,16 +5725,8 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
     }
 #endif
 
-    for( x = 0; x < dst.cols; x++ )
-    {
-        adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
-        bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
-    }
-
-    Range range(0, dst.rows);
-    WarpAffineInvoker invoker(src, dst, interpolation, borderType,
-                              borderValue, adelta, bdelta, M);
-    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+    hal::warpAffine(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
+                    M, interpolation, borderType, borderValue.val);
 }
 
 
@@ -5703,7 +5737,7 @@ class WarpPerspectiveInvoker :
     public ParallelLoopBody
 {
 public:
-    WarpPerspectiveInvoker(const Mat &_src, Mat &_dst, double *_M, int _interpolation,
+    WarpPerspectiveInvoker(const Mat &_src, Mat &_dst, const double *_M, int _interpolation,
                            int _borderType, const Scalar &_borderValue) :
         ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
         borderType(_borderType), borderValue(_borderValue)
@@ -6037,12 +6071,11 @@ public:
 private:
     Mat src;
     Mat dst;
-    double* M;
+    const double* M;
     int interpolation, borderType;
     Scalar borderValue;
 };
 
-
 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
 class IPPWarpPerspectiveInvoker :
     public ParallelLoopBody
@@ -6095,8 +6128,26 @@ private:
     const IPPWarpPerspectiveInvoker& operator= (const IPPWarpPerspectiveInvoker&);
 };
 #endif
+
+namespace hal {
+// Pointer-based warpPerspective. NOTE(review): "warpPerspectve" is misspelled; a rename must cover its declaration and callers too.
+void warpPerspectve(int src_type,
+                    const uchar * src_data, size_t src_step, int src_width, int src_height,
+                    uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
+                    const double M[9], int interpolation, int borderType, const double borderValue[4])
+{
+    CALL_HAL(warpPerspective, cv_hal_warpPerspective, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue); // NOTE(review): macro defined elsewhere; presumably returns when the replacement succeeds - confirm
+    Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step); // header over the caller's source buffer
+    Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step); // destination header uses src_type as well
+    // The invoker is run over all destination rows through parallel_for_.
+    Range range(0, dst.rows);
+    WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]));
+    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
 }
 
+} // hal::
+} // cv::
+
 void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
                           Size dsize, int flags, int borderType, const Scalar& borderValue )
 {
@@ -6122,12 +6173,6 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
     M0.convertTo(matM, matM.type());
 
-#ifdef HAVE_TEGRA_OPTIMIZATION
-    if( tegra::useTegra() && tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) )
-        return;
-#endif
-
-
 #if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && IPP_DISABLE_BLOCK
     CV_IPP_CHECK()
     {
@@ -6190,9 +6235,8 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
     if( !(flags & WARP_INVERSE_MAP) )
         invert(matM, matM);
 
-    Range range(0, dst.rows);
-    WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
-    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
+    hal::warpPerspectve(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
+                        matM.ptr<double>(), interpolation, borderType, borderValue.val);
 }
 
 
diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.cpp
index d70edd071..3b799803c 100644
--- a/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@@ -1079,7 +1079,7 @@ namespace cv
 
 // ===== 1. replacement implementation
 
-struct ReplacementMorphImpl : public hal::MorphContext
+struct ReplacementMorphImpl : public hal::Morph
 {
     cvhalFilter2D * ctx;
     bool isInitialized;
@@ -1184,7 +1184,7 @@ INIT_TRAIT(CV_32FC4, 32f, 32f_C4R, 4, zero[4] = {0})
 
 //--------------------------------------
 
-struct IppMorphBaseImpl : public hal::MorphContext
+struct IppMorphBaseImpl : public hal::Morph
 {
     virtual bool init(int _op, int _src_type, int dst_type, int max_width, int max_height,
               int kernel_type, uchar * kernel_data, size_t kernel_step, int kernel_width, int kernel_height,
@@ -1379,7 +1379,7 @@ static IppMorphBaseImpl * createIppImpl(int type)
 
 // ===== 3. Fallback implementation
 
-struct OcvMorphImpl : public hal::MorphContext
+struct OcvMorphImpl : public hal::Morph
 {
     Ptr<FilterEngine> f;
     int iterations;
@@ -1425,7 +1425,7 @@ struct OcvMorphImpl : public hal::MorphContext
 
 namespace hal {
 
-Ptr<MorphContext> MorphContext ::create(int op, int src_type, int dst_type, int max_width, int max_height,
+Ptr<Morph> Morph ::create(int op, int src_type, int dst_type, int max_width, int max_height,
                                         int kernel_type, uchar * kernel_data, size_t kernel_step, int kernel_width, int kernel_height,
                                         int anchor_x, int anchor_y,
                                         int borderType, const double borderValue[4],
@@ -1438,7 +1438,7 @@ Ptr<MorphContext> MorphContext ::create(int op, int src_type, int dst_type, int
                        anchor_x, anchor_y,
                        borderType, borderValue, iterations, isSubmatrix, allowInplace))
         {
-            return Ptr<MorphContext>(impl);
+            return Ptr<Morph>(impl);
         }
         delete impl;
     }
@@ -1453,7 +1453,7 @@ Ptr<MorphContext> MorphContext ::create(int op, int src_type, int dst_type, int
                         anchor_x, anchor_y,
                         borderType, borderValue, iterations, isSubmatrix, allowInplace))
             {
-                return Ptr<MorphContext>(impl);
+                return Ptr<Morph>(impl);
             }
             delete impl;
         }
@@ -1465,7 +1465,7 @@ Ptr<MorphContext> MorphContext ::create(int op, int src_type, int dst_type, int
                 kernel_type, kernel_data, kernel_step, kernel_width, kernel_height,
                 anchor_x, anchor_y,
                 borderType, borderValue, iterations, isSubmatrix, allowInplace);
-        return Ptr<MorphContext>(impl);
+        return Ptr<Morph>(impl);
     }
 }
 
@@ -1858,7 +1858,7 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
     Size d_wsz(dst.cols, dst.rows);
     dst.locateROI(d_wsz, d_ofs);
 
-    Ptr<hal::MorphContext> ctx = hal::MorphContext::create(op, src.type(), dst.type(), src.cols, src.rows,
+    Ptr<hal::Morph> ctx = hal::Morph::create(op, src.type(), dst.type(), src.cols, src.rows,
                                                            kernel.type(), kernel.data, kernel.step, kernel.cols, kernel.rows,
                                                            anchor.x, anchor.y, borderType, borderValue.val, iterations,
                                                            src.isSubmatrix(), src.data == dst.data);

From 008abd28fd27742d574372acdb5e6839ffb5316e Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@itseez.com>
Date: Wed, 13 Jan 2016 17:23:57 +0300
Subject: [PATCH 2/7] Extracted HAL interfaces for DFT/DCT, added new test

---
 modules/core/include/opencv2/core/hal/hal.hpp |   19 +
 .../core/include/opencv2/core/hal/interface.h |   10 +
 modules/core/src/dxt.cpp                      | 1923 +++++++++++------
 modules/core/src/hal_replacement.hpp          |   25 +
 modules/core/test/test_dxt.cpp                |   76 +
 modules/imgproc/src/templmatch.cpp            |   36 +-
 6 files changed, 1405 insertions(+), 684 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/hal.hpp b/modules/core/include/opencv2/core/hal/hal.hpp
index 64af09ab8..52a5f99b3 100644
--- a/modules/core/include/opencv2/core/hal/hal.hpp
+++ b/modules/core/include/opencv2/core/hal/hal.hpp
@@ -187,6 +187,25 @@ CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2,
 CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
 CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
 
+struct DftContext // handle passed to the dft*/dct* Init, Run and Free functions declared below
+{
+    void * impl; // untyped pointer, null after construction; what it points to is not visible in this header
+    bool useReplacement; // false after construction; NOTE(review): presumably marks a replacement-HAL context - confirm
+    DftContext() : impl(0), useReplacement(false) {}
+};
+
+CV_EXPORTS void dftInit2D(DftContext & c, int _width, int _height, int _depth, int _src_channels, int _dst_channels, int flags, int _nonzero_rows = 0);
+CV_EXPORTS void dftRun2D(const DftContext & c, const void * src, int src_step, void * dst, int dst_step);
+CV_EXPORTS void dftFree2D(DftContext & c);
+
+CV_EXPORTS void dftInit(DftContext & c, int len, int count, int depth, int flags, bool * useBuffer = 0);
+CV_EXPORTS void dftRun(const DftContext & c, const void * src, void * dst);
+CV_EXPORTS void dftFree(DftContext & c);
+
+CV_EXPORTS void dctInit(DftContext & c, int width, int height, int depth, int flags);
+CV_EXPORTS void dctRun(const DftContext & c, const void * src, int src_step, void * dst, int dst_step);
+CV_EXPORTS void dctFree(DftContext & c);
+
 //! @} core_hal
 
 //=============================================================================
diff --git a/modules/core/include/opencv2/core/hal/interface.h b/modules/core/include/opencv2/core/hal/interface.h
index ba3f55c33..0da68f18c 100644
--- a/modules/core/include/opencv2/core/hal/interface.h
+++ b/modules/core/include/opencv2/core/hal/interface.h
@@ -12,6 +12,16 @@
 //! @}
 
 
+#define CV_HAL_DFT_INVERSE        1
+#define CV_HAL_DFT_SCALE          2
+#define CV_HAL_DFT_ROWS           4
+#define CV_HAL_DFT_COMPLEX_OUTPUT 16
+#define CV_HAL_DFT_REAL_OUTPUT    32
+#define CV_HAL_DFT_TWO_STAGE      64
+#define CV_HAL_DFT_STAGE_COLS    128
+#define CV_HAL_DFT_IS_CONTINUOUS 512
+#define CV_HAL_DFT_IS_INPLACE 1024
+
 #ifdef __cplusplus
 #include <cstddef>
 #else
diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp
index 691b29746..1265091bc 100644
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -173,7 +173,7 @@ DFTFactorize( int n, int* factors )
 }
 
 static void
-DFTInit( int n0, int nf, int* factors, int* itab, int elem_size, void* _wave, int inv_itab )
+DFTInit( int n0, int nf, const int* factors, int* itab, int elem_size, void* _wave, int inv_itab )
 {
     int digits[34], radix[34];
     int n = factors[0], m = 0;
@@ -519,19 +519,59 @@ static IppStatus ippsDFTInv_PackToR( const double* src, double* dst,
 }
 #endif
 
-enum { DFT_NO_PERMUTE=256, DFT_COMPLEX_INPUT_OR_OUTPUT=512 };
+struct OcvDftOptions;
+
+typedef void (*DFTFunc)(const OcvDftOptions & c, const void* src, void* dst);
+// Bundles the values DFT() previously received as separate arguments (n, nf, factors, itab, wave, tab_size, flags, scale).
+struct OcvDftOptions {
+    int nf; // NOTE(review): presumably the number of entries used in factors - confirm
+    int *factors;
+    double scale; // DFT() casts this to T and uses it as its scale
+
+    int* itab; // read by DFT() as a const int table
+    void* wave; // DFT() reinterprets this as const Complex<T>*
+    int tab_size; // DFT() derives tab_step from tab_size relative to n
+    int n;
+
+    bool isInverse; // replaces the former (flags & DFT_INVERSE) test in DFT()
+    bool noPermute; // replaces the former (flags & DFT_NO_PERMUTE) test in DFT()
+    bool isComplex; // NOTE(review): presumably replaces DFT_COMPLEX_INPUT_OR_OUTPUT - confirm
+
+    bool haveSSE3; // result of checkHardwareSupport(CV_CPU_SSE3), taken in the constructor
+
+    DFTFunc dft_func;
+    bool useIpp; // when set, DFT() enters its IPP branch (compiled only under USE_IPP_DFT)
+
+#ifdef USE_IPP_DFT
+    uchar* ipp_spec; // passed to ippsDFTFwd_CToC / ippsDFTInv_CToC in the spec position
+    uchar* ipp_work; // passed to the same calls in the work-buffer position
+#endif
+    // Every member starts at zero/false except haveSSE3, which is queried from the hardware.
+    OcvDftOptions()
+    {
+        nf = 0;
+        factors = 0;
+        scale = 0;
+        itab = 0;
+        wave = 0;
+        tab_size = 0;
+        n = 0;
+        isInverse = false;
+        noPermute = false;
+        isComplex = false;
+        useIpp = false;
+#ifdef USE_IPP_DFT
+        ipp_spec = 0;
+        ipp_work = 0;
+#endif
+        dft_func = 0;
+        haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
+    }
+};
 
 // mixed-radix complex discrete Fourier transform: double-precision version
 template<typename T> static void
-DFT( const Complex<T>* src, Complex<T>* dst, int n,
-     int nf, const int* factors, const int* itab,
-     const Complex<T>* wave, int tab_size,
-     const void*
-#ifdef USE_IPP_DFT
-     spec
-#endif
-     , Complex<T>* buf,
-     int flags, double _scale )
+DFT(const OcvDftOptions & c, const Complex<T>* src, Complex<T>* dst)
 {
     static const T sin_120 = (T)0.86602540378443864676372317075294;
     static const T fft5_2 = (T)0.559016994374947424102293417182819;
@@ -539,20 +579,23 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
     static const T fft5_4 = (T)-1.538841768587626701285145288018455;
     static const T fft5_5 = (T)0.363271264002680442947733378740309;
 
-    int n0 = n, f_idx, nx;
-    int inv = flags & DFT_INVERSE;
-    int dw0 = tab_size, dw;
+    const Complex<T>* wave = (Complex<T>*)c.wave;
+    const int * itab = c.itab;
+
+    int n = c.n;
+    int f_idx, nx;
+    int inv = c.isInverse;
+    int dw0 = c.tab_size, dw;
     int i, j, k;
     Complex<T> t;
-    T scale = (T)_scale;
-    int tab_step;
+    T scale = (T)c.scale;
 
-#ifdef USE_IPP_DFT
-    if( spec )
+    if( c.useIpp )
     {
+#ifdef USE_IPP_DFT
         if( !inv )
         {
-            if (ippsDFTFwd_CToC( src, dst, spec, (uchar*)buf ) >= 0)
+            if (ippsDFTFwd_CToC( src, dst, c.ipp_spec, c.ipp_work ) >= 0)
             {
                 CV_IMPL_ADD(CV_IMPL_IPP);
                 return;
@@ -560,22 +603,22 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
         }
         else
         {
-            if (ippsDFTInv_CToC( src, dst, spec, (uchar*)buf ) >= 0)
+            if (ippsDFTInv_CToC( src, dst, c.ipp_spec, c.ipp_work ) >= 0)
             {
                 CV_IMPL_ADD(CV_IMPL_IPP);
                 return;
             }
         }
         setIppErrorStatus();
-    }
 #endif
+    }
 
-    tab_step = tab_size == n ? 1 : tab_size == n*2 ? 2 : tab_size/n;
+    int tab_step = c.tab_size == n ? 1 : c.tab_size == n*2 ? 2 : c.tab_size/n;
 
     // 0. shuffle data
     if( dst != src )
     {
-        assert( (flags & DFT_NO_PERMUTE) == 0 );
+        assert( !c.noPermute );
         if( !inv )
         {
             for( i = 0; i <= n - 2; i += 2, itab += 2*tab_step )
@@ -609,10 +652,10 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
     }
     else
     {
-        if( (flags & DFT_NO_PERMUTE) == 0 )
+        if( !c.noPermute )
         {
-            CV_Assert( factors[0] == factors[nf-1] );
-            if( nf == 1 )
+            CV_Assert( c.factors[0] == c.factors[c.nf-1] );
+            if( c.nf == 1 )
             {
                 if( (n & 3) == 0 )
                 {
@@ -662,22 +705,22 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
 
     n = 1;
     // 1. power-2 transforms
-    if( (factors[0] & 1) == 0 )
+    if( (c.factors[0] & 1) == 0 )
     {
-        if( factors[0] >= 4 && checkHardwareSupport(CV_CPU_SSE3))
+        if( c.factors[0] >= 4 && c.haveSSE3)
         {
             DFT_VecR4<T> vr4;
-            n = vr4(dst, factors[0], n0, dw0, wave);
+            n = vr4(dst, c.factors[0], c.n, dw0, wave);
         }
 
         // radix-4 transform
-        for( ; n*4 <= factors[0]; )
+        for( ; n*4 <= c.factors[0]; )
         {
             nx = n;
             n *= 4;
             dw0 /= 4;
 
-            for( i = 0; i < n0; i += n )
+            for( i = 0; i < c.n; i += n )
             {
                 Complex<T> *v0, *v1;
                 T r0, i0, r1, i1, r2, i2, r3, i3, r4, i4;
@@ -729,14 +772,14 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
             }
         }
 
-        for( ; n < factors[0]; )
+        for( ; n < c.factors[0]; )
         {
             // do the remaining radix-2 transform
             nx = n;
             n *= 2;
             dw0 /= 2;
 
-            for( i = 0; i < n0; i += n )
+            for( i = 0; i < c.n; i += n )
             {
                 Complex<T>* v = dst + i;
                 T r0 = v[0].re + v[nx].re;
@@ -761,9 +804,9 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
     }
 
     // 2. all the other transforms
-    for( f_idx = (factors[0]&1) ? 0 : 1; f_idx < nf; f_idx++ )
+    for( f_idx = (c.factors[0]&1) ? 0 : 1; f_idx < c.nf; f_idx++ )
     {
-        int factor = factors[f_idx];
+        int factor = c.factors[f_idx];
         nx = n;
         n *= factor;
         dw0 /= factor;
@@ -771,7 +814,7 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
         if( factor == 3 )
         {
             // radix-3
-            for( i = 0; i < n0; i += n )
+            for( i = 0; i < c.n; i += n )
             {
                 Complex<T>* v = dst + i;
 
@@ -807,7 +850,7 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
         else if( factor == 5 )
         {
             // radix-5
-            for( i = 0; i < n0; i += n )
+            for( i = 0; i < c.n; i += n )
             {
                 for( j = 0, dw = 0; j < nx; j++, dw += dw0 )
                 {
@@ -863,11 +906,12 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
         {
             // radix-"factor" - an odd number
             int p, q, factor2 = (factor - 1)/2;
-            int d, dd, dw_f = tab_size/factor;
+            int d, dd, dw_f = c.tab_size/factor;
+            AutoBuffer<Complex<T> > buf(factor2 * 2);
             Complex<T>* a = buf;
-            Complex<T>* b = buf + factor2;
+            Complex<T>* b = a + factor2;
 
-            for( i = 0; i < n0; i += n )
+            for( i = 0; i < c.n; i += n )
             {
                 for( j = 0, dw = 0; j < nx; j++, dw += dw0 )
                 {
@@ -931,7 +975,7 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
                             s1.im += r1 - i1; s0.im += r1 + i1;
 
                             d += dd;
-                            d -= -(d >= tab_size) & tab_size;
+                            d -= -(d >= c.tab_size) & c.tab_size;
                         }
 
                         v[k] = s0;
@@ -948,7 +992,7 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
         if( inv )
             im_scale = -im_scale;
 
-        for( i = 0; i < n0; i++ )
+        for( i = 0; i < c.n; i++ )
         {
             T t0 = dst[i].re*re_scale;
             T t1 = dst[i].im*im_scale;
@@ -958,7 +1002,7 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
     }
     else if( inv )
     {
-        for( i = 0; i <= n0 - 2; i += 2 )
+        for( i = 0; i <= c.n - 2; i += 2 )
         {
             T t0 = -dst[i].im;
             T t1 = -dst[i+1].im;
@@ -966,8 +1010,8 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
             dst[i+1].im = t1;
         }
 
-        if( i < n0 )
-            dst[n0-1].im = -dst[n0-1].im;
+        if( i < c.n )
+            dst[c.n-1].im = -dst[c.n-1].im;
     }
 }
 
@@ -977,23 +1021,18 @@ DFT( const Complex<T>* src, Complex<T>* dst, int n,
      re(0), re(1), im(1), ... , re(n/2-1), im((n+1)/2-1) [, re((n+1)/2)] OR ...
      re(0), 0, re(1), im(1), ..., re(n/2-1), im((n+1)/2-1) [, re((n+1)/2), 0] */
 template<typename T> static void
-RealDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
-         const Complex<T>* wave, int tab_size, const void*
-#ifdef USE_IPP_DFT
-         spec
-#endif
-         ,
-         Complex<T>* buf, int flags, double _scale )
+RealDFT(const OcvDftOptions & c, const T* src, T* dst)
 {
-    int complex_output = (flags & DFT_COMPLEX_INPUT_OR_OUTPUT) != 0;
-    T scale = (T)_scale;
-    int j, n2 = n >> 1;
+    int n = c.n;
+    int complex_output = c.isComplex;
+    T scale = (T)c.scale;
+    int j;
     dst += complex_output;
 
-#ifdef USE_IPP_DFT
-    if( spec )
+    if( c.useIpp )
     {
-        if (ippsDFTFwd_RToPack( src, dst, spec, (uchar*)buf ) >=0)
+#ifdef USE_IPP_DFT
+        if (ippsDFTFwd_RToPack( src, dst, c.ipp_spec, c.ipp_work ) >=0)
         {
             if( complex_output )
             {
@@ -1006,9 +1045,9 @@ RealDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
             return;
         }
         setIppErrorStatus();
-    }
 #endif
-    assert( tab_size == n );
+    }
+    assert( c.tab_size == n );
 
     if( n == 1 )
     {
@@ -1028,15 +1067,19 @@ RealDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
         _dst[0].im = 0;
         for( j = 1; j < n; j += 2 )
         {
-            T t0 = src[itab[j]]*scale;
-            T t1 = src[itab[j+1]]*scale;
+            T t0 = src[c.itab[j]]*scale;
+            T t1 = src[c.itab[j+1]]*scale;
             _dst[j].re = t0;
             _dst[j].im = 0;
             _dst[j+1].re = t1;
             _dst[j+1].im = 0;
         }
-        DFT( _dst, _dst, n, nf, factors, itab, wave,
-             tab_size, 0, buf, DFT_NO_PERMUTE, 1 );
+        OcvDftOptions sub_c = c;
+        sub_c.isComplex = false;
+        sub_c.isInverse = false;
+        sub_c.noPermute = true;
+        sub_c.scale = 1.;
+        DFT(sub_c, _dst, _dst);
         if( !complex_output )
             dst[1] = dst[0];
     }
@@ -1045,12 +1088,22 @@ RealDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
         T t0, t;
         T h1_re, h1_im, h2_re, h2_im;
         T scale2 = scale*(T)0.5;
-        factors[0] >>= 1;
+        int n2 = n >> 1;
 
-        DFT( (Complex<T>*)src, (Complex<T>*)dst, n2, nf - (factors[0] == 1),
-             factors + (factors[0] == 1),
-             itab, wave, tab_size, 0, buf, 0, 1 );
-        factors[0] <<= 1;
+        c.factors[0] >>= 1;
+
+        OcvDftOptions sub_c = c;
+        sub_c.factors += (c.factors[0] == 1);
+        sub_c.nf -= (c.factors[0] == 1);
+        sub_c.isComplex = false;
+        sub_c.isInverse = false;
+        sub_c.noPermute = false;
+        sub_c.scale = 1.;
+        sub_c.n = n2;
+
+        DFT(sub_c, (Complex<T>*)src, (Complex<T>*)dst);
+
+        c.factors[0] <<= 1;
 
         t = dst[0] - dst[1];
         dst[0] = (dst[0] + dst[1])*scale;
@@ -1060,6 +1113,8 @@ RealDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
         t = dst[n-1];
         dst[n-1] = dst[1];
 
+        const Complex<T> *wave = (const Complex<T>*)c.wave;
+
         for( j = 2, wave++; j < n2; j += 2, wave++ )
         {
             /* calc odd */
@@ -1103,22 +1158,16 @@ RealDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
       re[0], re[1], im[1], ... , re[n/2-1], im[n/2-1], re[n/2] OR
       re(0), 0, re(1), im(1), ..., re(n/2-1), im((n+1)/2-1) [, re((n+1)/2), 0] */
 template<typename T> static void
-CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
-         const Complex<T>* wave, int tab_size,
-         const void*
-#ifdef USE_IPP_DFT
-         spec
-#endif
-         , Complex<T>* buf,
-         int flags, double _scale )
+CCSIDFT(const OcvDftOptions & c, const T* src, T* dst)
 {
-    int complex_input = (flags & DFT_COMPLEX_INPUT_OR_OUTPUT) != 0;
-    int j, k, n2 = (n+1) >> 1;
-    T scale = (T)_scale;
+    int n = c.n;
+    int complex_input = c.isComplex;
+    int j, k;
+    T scale = (T)c.scale;
     T save_s1 = 0.;
     T t0, t1, t2, t3, t;
 
-    assert( tab_size == n );
+    assert( c.tab_size == n );
 
     if( complex_input )
     {
@@ -1127,10 +1176,10 @@ CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
         ((T*)src)[1] = src[0];
         src++;
     }
-#ifdef USE_IPP_DFT
-    if( spec )
+    if( c.useIpp )
     {
-        if (ippsDFTInv_PackToR( src, dst, spec, (uchar*)buf ) >=0)
+#ifdef USE_IPP_DFT
+        if (ippsDFTInv_PackToR( src, dst, c.ipp_spec, c.ipp_work ) >=0)
         {
             if( complex_input )
                 ((T*)src)[0] = (T)save_s1;
@@ -1139,8 +1188,8 @@ CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
         }
 
         setIppErrorStatus();
-    }
 #endif
+    }
     if( n == 1 )
     {
         dst[0] = (T)(src[0]*scale);
@@ -1158,16 +1207,25 @@ CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
 
         _dst[0].re = src[0];
         _dst[0].im = 0;
+
+        int n2 = (n+1) >> 1;
+
         for( j = 1; j < n2; j++ )
         {
-            int k0 = itab[j], k1 = itab[n-j];
+            int k0 = c.itab[j], k1 = c.itab[n-j];
             t0 = _src[j].re; t1 = _src[j].im;
             _dst[k0].re = t0; _dst[k0].im = -t1;
             _dst[k1].re = t0; _dst[k1].im = t1;
         }
 
-        DFT( _dst, _dst, n, nf, factors, itab, wave,
-             tab_size, 0, buf, DFT_NO_PERMUTE, 1. );
+        OcvDftOptions sub_c = c;
+        sub_c.isComplex = false;
+        sub_c.isInverse = false;
+        sub_c.noPermute = true;
+        sub_c.scale = 1.;
+        sub_c.n = n;
+
+        DFT(sub_c, _dst, _dst);
         dst[0] *= scale;
         for( j = 1; j < n; j += 2 )
         {
@@ -1180,7 +1238,7 @@ CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
     else
     {
         int inplace = src == dst;
-        const Complex<T>* w = wave;
+        const Complex<T>* w = (const Complex<T>*)c.wave;
 
         t = src[1];
         t0 = (src[0] + src[n-1]);
@@ -1188,6 +1246,8 @@ CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
         dst[0] = t0;
         dst[1] = t1;
 
+        int n2 = (n+1) >> 1;
+
         for( j = 2, w++; j < n2; j += 2, w++ )
         {
             T h1_re, h1_im, h2_re, h2_im;
@@ -1218,10 +1278,10 @@ CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
             else
             {
                 int j2 = j >> 1;
-                k = itab[j2];
+                k = c.itab[j2];
                 dst[k] = t0;
                 dst[k+1] = t1;
-                k = itab[n2-j2];
+                k = c.itab[n2-j2];
                 dst[k] = t2;
                 dst[k+1]= t3;
             }
@@ -1239,19 +1299,26 @@ CCSIDFT( const T* src, T* dst, int n, int nf, int* factors, const int* itab,
             }
             else
             {
-                k = itab[n2];
+                k = c.itab[n2];
                 dst[k*2] = t0;
                 dst[k*2+1] = t1;
             }
         }
 
-        factors[0] >>= 1;
-        DFT( (Complex<T>*)dst, (Complex<T>*)dst, n2,
-             nf - (factors[0] == 1),
-             factors + (factors[0] == 1), itab,
-             wave, tab_size, 0, buf,
-             inplace ? 0 : DFT_NO_PERMUTE, 1. );
-        factors[0] <<= 1;
+        c.factors[0] >>= 1;
+
+        OcvDftOptions sub_c = c;
+        sub_c.factors += (c.factors[0] == 1);
+        sub_c.nf -= (c.factors[0] == 1);
+        sub_c.isComplex = false;
+        sub_c.isInverse = false;
+        sub_c.noPermute = !inplace;
+        sub_c.scale = 1.;
+        sub_c.n = n2;
+
+        DFT(sub_c, (Complex<T>*)dst, (Complex<T>*)dst);
+
+        c.factors[0] <<= 1;
 
         for( j = 0; j < n; j += 2 )
         {
@@ -1436,57 +1503,35 @@ ExpandCCS( uchar* _ptr, int n, int elem_size )
     }
 }
 
-
-typedef void (*DFTFunc)(
-     const void* src, void* dst, int n, int nf, int* factors,
-     const int* itab, const void* wave, int tab_size,
-     const void* spec, void* buf, int inv, double scale );
-
-static void DFT_32f( const Complexf* src, Complexf* dst, int n,
-    int nf, const int* factors, const int* itab,
-    const Complexf* wave, int tab_size,
-    const void* spec, Complexf* buf,
-    int flags, double scale )
+static void DFT_32f(const OcvDftOptions & c, const Complexf* src, Complexf* dst) // non-template entry point for Complexf data
 {
-    DFT(src, dst, n, nf, factors, itab, wave, tab_size, spec, buf, flags, scale);
+    DFT(c, src, dst); // forwards unchanged to the DFT template
 }
 
-static void DFT_64f( const Complexd* src, Complexd* dst, int n,
-    int nf, const int* factors, const int* itab,
-    const Complexd* wave, int tab_size,
-    const void* spec, Complexd* buf,
-    int flags, double scale )
+static void DFT_64f(const OcvDftOptions & c, const Complexd* src, Complexd* dst) // non-template entry point for Complexd data
 {
-    DFT(src, dst, n, nf, factors, itab, wave, tab_size, spec, buf, flags, scale);
+    DFT(c, src, dst); // forwards unchanged to the DFT template
 }
 
 
-static void RealDFT_32f( const float* src, float* dst, int n, int nf, int* factors,
-        const int* itab,  const Complexf* wave, int tab_size, const void* spec,
-        Complexf* buf, int flags, double scale )
+static void RealDFT_32f(const OcvDftOptions & c, const float* src, float* dst) // non-template entry point for float data
 {
-    RealDFT( src, dst, n, nf, factors, itab, wave, tab_size, spec, buf, flags, scale);
+    RealDFT(c, src, dst); // forwards unchanged to the RealDFT template
 }
 
-static void RealDFT_64f( const double* src, double* dst, int n, int nf, int* factors,
-        const int* itab,  const Complexd* wave, int tab_size, const void* spec,
-        Complexd* buf, int flags, double scale )
+static void RealDFT_64f(const OcvDftOptions & c, const double* src, double* dst) // non-template entry point for double data
 {
-    RealDFT( src, dst, n, nf, factors, itab, wave, tab_size, spec, buf, flags, scale);
+    RealDFT(c, src, dst); // forwards unchanged to the RealDFT template
 }
 
-static void CCSIDFT_32f( const float* src, float* dst, int n, int nf, int* factors,
-                         const int* itab,  const Complexf* wave, int tab_size, const void* spec,
-                         Complexf* buf, int flags, double scale )
+static void CCSIDFT_32f(const OcvDftOptions & c, const float* src, float* dst) // non-template entry point for float data
 {
-    CCSIDFT( src, dst, n, nf, factors, itab, wave, tab_size, spec, buf, flags, scale);
+    CCSIDFT(c, src, dst); // forwards unchanged to the CCSIDFT template
 }
 
-static void CCSIDFT_64f( const double* src, double* dst, int n, int nf, int* factors,
-                         const int* itab,  const Complexd* wave, int tab_size, const void* spec,
-                         Complexd* buf, int flags, double scale )
+static void CCSIDFT_64f(const OcvDftOptions & c, const double* src, double* dst) // non-template entry point for double data
 {
-    CCSIDFT( src, dst, n, nf, factors, itab, wave, tab_size, spec, buf, flags, scale);
+    CCSIDFT(c, src, dst); // forwards unchanged to the CCSIDFT template
 }
 
 }
@@ -1508,8 +1553,11 @@ class Dft_C_IPPLoop_Invoker : public ParallelLoopBody
 {
 public:
 
-    Dft_C_IPPLoop_Invoker(const Mat& _src, Mat& _dst, const Dft& _ippidft, int _norm_flag, bool *_ok) :
-        ParallelLoopBody(), src(_src), dst(_dst), ippidft(_ippidft), norm_flag(_norm_flag), ok(_ok)
+    Dft_C_IPPLoop_Invoker(uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width,
+                          const Dft& _ippidft, int _norm_flag, bool *_ok) :
+        ParallelLoopBody(),
+        src(_src), src_step(_src_step), dst(_dst), dst_step(_dst_step), width(_width),
+        ippidft(_ippidft), norm_flag(_norm_flag), ok(_ok)
     {
         *ok = true;
     }
@@ -1523,7 +1571,7 @@ public:
         int sizeSpec=0;
         int sizeInit=0;
 
-        IppiSize srcRoiSize = {src.cols, 1};
+        IppiSize srcRoiSize = {width, 1};
 
         status = ippiDFTGetSize_C_32fc(srcRoiSize, norm_flag, ippAlgHintNone, &sizeSpec, &sizeInit, &sizeBuffer );
         if ( status < 0 )
@@ -1555,7 +1603,8 @@ public:
         }
 
         for( int i = range.start; i < range.end; ++i)
-            if(!ippidft(src.ptr<Ipp32fc>(i), (int)src.step,dst.ptr<Ipp32fc>(i), (int)dst.step, pDFTSpec, (Ipp8u*)pBuffer))
+            if(!ippidft((Ipp32fc*)(src + (size_t)src_step * i), src_step, (Ipp32fc*)(dst + (size_t)dst_step * i), dst_step,
+                        pDFTSpec, (Ipp8u*)pBuffer)) // row offset computed in size_t: int step*row overflows once a buffer reaches 2 GB
             {
                 *ok = false;
             }
@@ -1568,8 +1617,11 @@ public:
     }
 
 private:
-    const Mat& src;
-    Mat& dst;
+    uchar * src;
+    int src_step;
+    uchar * dst;
+    int dst_step;
+    int width;
     const Dft& ippidft;
     int norm_flag;
     bool *ok;
@@ -1582,8 +1634,11 @@ class Dft_R_IPPLoop_Invoker : public ParallelLoopBody
 {
 public:
 
-    Dft_R_IPPLoop_Invoker(const Mat& _src, Mat& _dst, const Dft& _ippidft, int _norm_flag, bool *_ok) :
-        ParallelLoopBody(), src(_src), dst(_dst), ippidft(_ippidft), norm_flag(_norm_flag), ok(_ok)
+    Dft_R_IPPLoop_Invoker(uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width,
+                          const Dft& _ippidft, int _norm_flag, bool *_ok) :
+        ParallelLoopBody(),
+        src(_src), src_step(_src_step), dst(_dst), dst_step(_dst_step), width(_width),
+        ippidft(_ippidft), norm_flag(_norm_flag), ok(_ok)
     {
         *ok = true;
     }
@@ -1597,7 +1652,7 @@ public:
         int sizeSpec=0;
         int sizeInit=0;
 
-        IppiSize srcRoiSize = {src.cols, 1};
+        IppiSize srcRoiSize = {width, 1};
 
         status = ippiDFTGetSize_R_32f(srcRoiSize, norm_flag, ippAlgHintNone, &sizeSpec, &sizeInit, &sizeBuffer );
         if ( status < 0 )
@@ -1629,7 +1684,8 @@ public:
         }
 
         for( int i = range.start; i < range.end; ++i)
-            if(!ippidft(src.ptr<float>(i), (int)src.step,dst.ptr<float>(i), (int)dst.step, pDFTSpec, (Ipp8u*)pBuffer))
+            if(!ippidft((float*)(src + (size_t)src_step * i), src_step, (float*)(dst + (size_t)dst_step * i), dst_step,
+                        pDFTSpec, (Ipp8u*)pBuffer)) // row offset computed in size_t: int step*row overflows once a buffer reaches 2 GB
             {
                 *ok = false;
             }
@@ -1642,8 +1698,11 @@ public:
     }
 
 private:
-    const Mat& src;
-    Mat& dst;
+    uchar * src;
+    int src_step;
+    uchar * dst;
+    int dst_step;
+    int width;
     const Dft& ippidft;
     int norm_flag;
     bool *ok;
@@ -1652,18 +1711,18 @@ private:
 };
 
 template <typename Dft>
-bool Dft_C_IPPLoop(const Mat& src, Mat& dst, const Dft& ippidft, int norm_flag)
+bool Dft_C_IPPLoop(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, const Dft& ippidft, int norm_flag) // applies ippidft to each of the height rows through parallel_for_; steps are byte strides; returns the flag the invoker reports
 {
     bool ok;
-    parallel_for_(Range(0, src.rows), Dft_C_IPPLoop_Invoker<Dft>(src, dst, ippidft, norm_flag, &ok), src.total()/(double)(1<<16) );
+    parallel_for_(Range(0, height), Dft_C_IPPLoop_Invoker<Dft>(src, src_step, dst, dst_step, width, ippidft, norm_flag, &ok), ((double)width * height)/(1<<16) ); // stripe hint computed in double so width*height cannot overflow int
     return ok;
 }
 
 template <typename Dft>
-bool Dft_R_IPPLoop(const Mat& src, Mat& dst, const Dft& ippidft, int norm_flag)
+bool Dft_R_IPPLoop(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, const Dft& ippidft, int norm_flag) // applies ippidft to each of the height rows through parallel_for_; steps are byte strides; returns the flag the invoker reports
 {
     bool ok;
-    parallel_for_(Range(0, src.rows), Dft_R_IPPLoop_Invoker<Dft>(src, dst, ippidft, norm_flag, &ok), src.total()/(double)(1<<16) );
+    parallel_for_(Range(0, height), Dft_R_IPPLoop_Invoker<Dft>(src, src_step, dst, dst_step, width, ippidft, norm_flag, &ok), ((double)width * height)/(1<<16) ); // stripe hint computed in double so width*height cannot overflow int
     return ok;
 }
 
@@ -1691,7 +1750,7 @@ private:
     ippiDFT_R_Func func;
 };
 
-static bool ippi_DFT_C_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
+static bool ippi_DFT_C_32F(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, int norm_flag)
 {
     IppStatus status;
     Ipp8u* pBuffer = 0;
@@ -1700,7 +1759,7 @@ static bool ippi_DFT_C_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
     int sizeSpec=0;
     int sizeInit=0;
 
-    IppiSize srcRoiSize = {src.cols, src.rows};
+    IppiSize srcRoiSize = {width, height};
 
     status = ippiDFTGetSize_C_32fc(srcRoiSize, norm_flag, ippAlgHintNone, &sizeSpec, &sizeInit, &sizeBuffer );
     if ( status < 0 )
@@ -1728,9 +1787,9 @@ static bool ippi_DFT_C_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
     }
 
     if (!inv)
-        status = ippiDFTFwd_CToC_32fc_C1R( src.ptr<Ipp32fc>(), (int)src.step, dst.ptr<Ipp32fc>(), (int)dst.step, pDFTSpec, pBuffer );
+        status = ippiDFTFwd_CToC_32fc_C1R( (Ipp32fc*)src, src_step, (Ipp32fc*)dst, dst_step, pDFTSpec, pBuffer );
     else
-        status = ippiDFTInv_CToC_32fc_C1R( src.ptr<Ipp32fc>(), (int)src.step, dst.ptr<Ipp32fc>(), (int)dst.step, pDFTSpec, pBuffer );
+        status = ippiDFTInv_CToC_32fc_C1R( (Ipp32fc*)src, src_step, (Ipp32fc*)dst, dst_step, pDFTSpec, pBuffer );
 
     if ( sizeBuffer > 0 )
         ippFree( pBuffer );
@@ -1745,7 +1804,7 @@ static bool ippi_DFT_C_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
     return false;
 }
 
-static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
+static bool ippi_DFT_R_32F(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, int norm_flag)
 {
     IppStatus status;
     Ipp8u* pBuffer = 0;
@@ -1754,7 +1813,7 @@ static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
     int sizeSpec=0;
     int sizeInit=0;
 
-    IppiSize srcRoiSize = {src.cols, src.rows};
+    IppiSize srcRoiSize = {width, height};
 
     status = ippiDFTGetSize_R_32f(srcRoiSize, norm_flag, ippAlgHintNone, &sizeSpec, &sizeInit, &sizeBuffer );
     if ( status < 0 )
@@ -1782,9 +1841,9 @@ static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
     }
 
     if (!inv)
-        status = ippiDFTFwd_RToPack_32f_C1R( src.ptr<float>(), (int)(src.step), dst.ptr<float>(), (int)dst.step, pDFTSpec, pBuffer );
+        status = ippiDFTFwd_RToPack_32f_C1R( (float*)src, src_step, (float*)dst, dst_step, pDFTSpec, pBuffer );
     else
-        status = ippiDFTInv_PackToR_32f_C1R( src.ptr<float>(), (int)src.step, dst.ptr<float>(), (int)dst.step, pDFTSpec, pBuffer );
+        status = ippiDFTInv_PackToR_32f_C1R( (float*)src, src_step, (float*)dst, dst_step, pDFTSpec, pBuffer );
 
     if ( sizeBuffer > 0 )
         ippFree( pBuffer );
@@ -2426,111 +2485,324 @@ static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags)
 
 namespace cv
 {
-static void complementComplexOutput(Mat& dst, int len, int dft_dims)
-{
-    int i, n = dst.cols;
-    size_t elem_size = dst.elemSize1();
-    if( elem_size == sizeof(float) )
-    {
-        float* p0 = dst.ptr<float>();
-        size_t dstep = dst.step/sizeof(p0[0]);
-        for( i = 0; i < len; i++ )
-        {
-            float* p = p0 + dstep*i;
-            float* q = dft_dims == 1 || i == 0 || i*2 == len ? p : p0 + dstep*(len-i);
 
-            for( int j = 1; j < (n+1)/2; j++ )
-            {
-                p[(n-j)*2] = q[j*2];
-                p[(n-j)*2+1] = -q[j*2+1];
-            }
+template <typename T>
+static void complementComplex(T * ptr, int step, int n, int len, int dft_dims) // fills elements n-j of every row with the conjugate of element j (interleaved re,im pairs of T)
+{
+    T* p0 = (T*)ptr;
+    size_t dstep = step/sizeof(p0[0]); // step is a byte stride; dstep is the same stride in elements of T
+    for(int i = 0; i < len; i++ )
+    {
+        T* p = p0 + dstep*i;
+        T* q = dft_dims == 1 || i == 0 || i*2 == len ? p : p0 + dstep*(len-i); // source row: the row itself for 1-D data, row 0 and the middle row; otherwise row len-i
+        // copy the real part and negate the imaginary part for j = 1 .. (n+1)/2 - 1
+        for( int j = 1; j < (n+1)/2; j++ )
+        {
+            p[(n-j)*2] = q[j*2];
+            p[(n-j)*2+1] = -q[j*2+1];
         }
     }
+}
+
+static void complementComplexOutput(int depth, uchar * ptr, int step, int count, int len, int dft_dims) // depth dispatch for complementComplex; count is forwarded as its n argument
+{
+    if( depth == CV_32F )
+        complementComplex((float*)ptr, step, count, len, dft_dims);
     else
-    {
-        double* p0 = dst.ptr<double>();
-        size_t dstep = dst.step/sizeof(p0[0]);
-        for( i = 0; i < len; i++ )
-        {
-            double* p = p0 + dstep*i;
-            double* q = dft_dims == 1 || i == 0 || i*2 == len ? p : p0 + dstep*(len-i);
-
-            for( int j = 1; j < (n+1)/2; j++ )
-            {
-                p[(n-j)*2] = q[j*2];
-                p[(n-j)*2+1] = -q[j*2+1];
-            }
-        }
-    }
-}
+        complementComplex((double*)ptr, step, count, len, dft_dims); // every depth other than CV_32F is treated as double
 }
 
-void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
+enum DftMode { // direction plus source/destination channel counts, as assigned by determineMode
+    InvalidDft = 0,      // unsupported combination
+    FwdRealToCCS,        // forward, 1 channel -> 1 channel
+    FwdRealToComplex,    // forward, 1 channel -> 2 channels
+    FwdComplexToComplex, // forward, 2 channels -> 2 channels
+    InvCCSToReal,        // inverse, 1 channel -> 1 channel
+    InvComplexToReal,    // inverse, 2 channels -> 1 channel
+    InvComplexToComplex  // inverse, 2 channels -> 2 channels
+};
+
+enum DftDims { // transform layout, as assigned by determineDims
+    InvalidDim = 0, // none of the layouts below matched
+    OneDim,         // row-wise request, a single row, or a continuous single column
+    OneDimColWise,  // single column with more than one row that is not continuous
+    TwoDims         // more than one row and more than one column
+};
+
+inline const char * modeName(DftMode m) // returns the enumerator's identifier as a string literal
 {
-#ifdef HAVE_CLAMDFFT
-    CV_OCL_RUN(ocl::haveAmdFft() && ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU &&
-            _dst.isUMat() && _src0.dims() <= 2 && nonzero_rows == 0,
-               ocl_dft_amdfft(_src0, _dst, flags))
-#endif
-
-#ifdef HAVE_OPENCL
-    CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2,
-               ocl_dft(_src0, _dst, flags, nonzero_rows))
-#endif
-
-    static DFTFunc dft_tbl[6] =
+    switch (m)
     {
-        (DFTFunc)DFT_32f,
-        (DFTFunc)RealDFT_32f,
-        (DFTFunc)CCSIDFT_32f,
-        (DFTFunc)DFT_64f,
-        (DFTFunc)RealDFT_64f,
-        (DFTFunc)CCSIDFT_64f
+    case InvalidDft: return "InvalidDft";
+    case FwdRealToCCS: return "FwdRealToCCS";
+    case FwdRealToComplex: return "FwdRealToComplex";
+    case FwdComplexToComplex: return "FwdComplexToComplex";
+    case InvCCSToReal: return "InvCCSToReal";
+    case InvComplexToReal: return "InvComplexToReal";
+    case InvComplexToComplex: return "InvComplexToComplex";
+    }
+    return 0; // null pointer for a value that is not a DftMode enumerator
+}
+
+inline const char * dimsName(DftDims d) // returns the enumerator's identifier as a string literal
+{
+    switch (d)
+    {
+    case InvalidDim: return "InvalidDim";
+    case OneDim: return "OneDim";
+    case OneDimColWise: return "OneDimColWise";
+    case TwoDims: return "TwoDims";
     };
-    AutoBuffer<uchar> buf;
-    Mat src0 = _src0.getMat(), src = src0;
-    int prev_len = 0, stage = 0;
-    bool inv = (flags & DFT_INVERSE) != 0;
-    int nf = 0, real_transform = src.channels() == 1 || (inv && (flags & DFT_REAL_OUTPUT)!=0);
-    int type = src.type(), depth = src.depth();
-    int elem_size = (int)src.elemSize1(), complex_elem_size = elem_size*2;
-    int factors[34];
-    bool inplace_transform = false;
-#ifdef USE_IPP_DFT
-    AutoBuffer<uchar> ippbuf;
-    int ipp_norm_flag = !(flags & DFT_SCALE) ? 8 : inv ? 2 : 1;
-#endif
+    return 0; // null pointer for a value that is not a DftDims enumerator
+}
 
-    CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 );
+template <typename T>
+inline bool isInv(T mode) // true when mode, converted to DftMode, is one of the three Inv* enumerators
+{
+    switch ((DftMode)mode) // T may be any type convertible to DftMode
+    {
+        case InvCCSToReal:
+        case InvComplexToReal:
+        case InvComplexToComplex: return true;
+        default: return false;
+    }
+}
 
-    if( !inv && src.channels() == 1 && (flags & DFT_COMPLEX_OUTPUT) )
-        _dst.create( src.size(), CV_MAKETYPE(depth, 2) );
-    else if( inv && src.channels() == 2 && (flags & DFT_REAL_OUTPUT) )
-        _dst.create( src.size(), depth );
+inline DftMode determineMode(bool inv, int cn1, int cn2) // maps direction and the channel pair (cn1, cn2) to a DftMode
+{
+    if (!inv)
+    {
+        if (cn1 == 1 && cn2 == 1)
+            return FwdRealToCCS;
+        else if (cn1 == 1 && cn2 == 2)
+            return FwdRealToComplex;
+        else if (cn1 == 2 && cn2 == 2)
+            return FwdComplexToComplex;
+    }
     else
-        _dst.create( src.size(), type );
+    {
+        if (cn1 == 1 && cn2 == 1)
+            return InvCCSToReal;
+        else if (cn1 == 2 && cn2 == 1)
+            return InvComplexToReal;
+        else if (cn1 == 2 && cn2 == 2)
+            return InvComplexToComplex;
+    }
+    return InvalidDft; // any other channel combination, e.g. forward 2 -> 1 or inverse 1 -> 2
+}
 
-    Mat dst = _dst.getMat();
+
+inline DftDims determineDims(int rows, int cols, bool isRowWise, bool isContinuous) // classifies the transform layout from shape and flags
+{
+    // A row-wise request is always handled as 1-D, whatever the shape.
+    if (isRowWise)
+        return OneDim;
+    if (cols == 1 && rows > 1) // one-column-shaped input
+    {
+        if (isContinuous)
+            return OneDim;
+        else
+            return OneDimColWise;
+    }
+    if (rows == 1)
+        return OneDim;
+    if (cols > 1 && rows > 1)
+        return TwoDims;
+    return InvalidDim; // reached when rows or cols is not positive
+}
+
+class OcvDftImpl
+{
+protected:
+    hal::DftContext contextA;
+    hal::DftContext contextB;
+    bool needBufferA;
+    bool needBufferB;
+    bool inv;
+    int width;
+    int height;
+    DftMode mode;
+    int elem_size;
+    int complex_elem_size;
+    int depth;
+    bool real_transform;
+    int nonzero_rows;
+    bool isRowTransform;
+    bool isScaled;
+    std::vector<int> stages;
+    bool useIpp;
+    int src_channels;
+    int dst_channels;
+
+    AutoBuffer<uchar> tmp_bufA;
+    AutoBuffer<uchar> tmp_bufB;
+    AutoBuffer<uchar> buf0;
+    AutoBuffer<uchar> buf1;
+
+public:
+    OcvDftImpl()
+    {
+        needBufferA = false;
+        needBufferB = false;
+        inv = false;
+        width = 0;
+        height = 0;
+        mode = InvalidDft;
+        elem_size = 0;
+        complex_elem_size = 0;
+        depth = 0;
+        real_transform = false;
+        nonzero_rows = 0;
+        isRowTransform = false;
+        isScaled = false;
+        useIpp = false;
+        src_channels = 0;
+        dst_channels = 0;
+    }
+
+    void init(int _width, int _height, int _depth, int _src_channels, int _dst_channels, int flags, int _nonzero_rows)
+    {
+        bool isComplex = _src_channels != _dst_channels;
+        nonzero_rows = _nonzero_rows;
+        width = _width;
+        height = _height;
+        depth = _depth;
+        src_channels = _src_channels;
+        dst_channels = _dst_channels;
+        bool isInverse = (flags & CV_HAL_DFT_INVERSE) != 0;
+        bool isInplace = (flags & CV_HAL_DFT_IS_INPLACE) != 0;
+        bool isContinuous = (flags & CV_HAL_DFT_IS_CONTINUOUS) != 0;
+        mode = determineMode(isInverse, _src_channels, _dst_channels);
+        inv = isInverse;
+        isRowTransform = (flags & CV_HAL_DFT_ROWS) != 0;
+        isScaled = (flags & CV_HAL_DFT_SCALE) != 0;
+        needBufferA = false;
+        needBufferB = false;
+        real_transform = (mode != FwdComplexToComplex && mode != InvComplexToComplex);
+
+        elem_size = (depth == CV_32F) ? sizeof(float) : sizeof(double);
+        complex_elem_size = elem_size * 2;
+        if( !real_transform )
+            elem_size = complex_elem_size;
 
 #if defined USE_IPP_DFT
-    CV_IPP_CHECK()
-    {
-        if ((src.depth() == CV_32F) && (src.total()>(int)(1<<6)) && nonzero_rows == 0)
+        CV_IPP_CHECK()
         {
-            if ((flags & DFT_ROWS) == 0)
+            if (nonzero_rows == 0 && depth == CV_32F && ((width * height)>(int)(1<<6)))
             {
-                if (src.channels() == 2 && !(inv && (flags & DFT_REAL_OUTPUT)))
+                if (mode == FwdComplexToComplex || mode == InvComplexToComplex || mode == FwdRealToCCS || mode == InvCCSToReal)
                 {
-                    if (ippi_DFT_C_32F(src, dst, inv, ipp_norm_flag))
+                    useIpp = true;
+                    return;
+                }
+            }
+        }
+#endif
+
+        DftDims dims = determineDims(height, width, isRowTransform, isContinuous);
+        if (dims == TwoDims)
+        {
+            stages.resize(2);
+            if (mode == InvCCSToReal || mode == InvComplexToReal)
+            {
+                stages[0] = 1;
+                stages[1] = 0;
+            }
+            else
+            {
+                stages[0] = 0;
+                stages[1] = 1;
+            }
+        }
+        else
+        {
+            stages.resize(1);
+            if (dims == OneDimColWise)
+                stages[0] = 1;
+            else
+                stages[0] = 0;
+        }
+
+        for(uint stageIndex = 0; stageIndex < stages.size(); ++stageIndex)
+        {
+            if (stageIndex == 1)
+            {
+                isInplace = true;
+                isComplex = false;
+            }
+
+            int stage = stages[stageIndex];
+            bool isLastStage = (stageIndex + 1 == stages.size());
+
+            int len, count;
+
+            int f = 0;
+            if (inv)
+                f |= CV_HAL_DFT_INVERSE;
+            if (isScaled)
+                f |= CV_HAL_DFT_SCALE;
+            if (isRowTransform)
+                f |= CV_HAL_DFT_ROWS;
+            if (isComplex)
+                f |= CV_HAL_DFT_COMPLEX_OUTPUT;
+            if (real_transform)
+                f |= CV_HAL_DFT_REAL_OUTPUT;
+            if (!isLastStage)
+                f |= CV_HAL_DFT_TWO_STAGE;
+
+            if( stage == 0 ) // row-wise transform
+            {
+                if (width == 1 && !isRowTransform )
+                {
+                    len = height;
+                    count = width;
+                }
+                else
+                {
+                    len = width;
+                    count = height;
+                }
+                needBufferA = isInplace;
+                hal::dftInit(contextA, len, count, depth, f, &needBufferA);
+                if (needBufferA)
+                    tmp_bufA.allocate(len * complex_elem_size);
+            }
+            else
+            {
+                len = height;
+                count = width;
+                f |= CV_HAL_DFT_STAGE_COLS;
+                needBufferB = isInplace;
+                hal::dftInit(contextB, len, count, depth, f, &needBufferB);
+                if (needBufferB)
+                    tmp_bufB.allocate(len * complex_elem_size);
+
+                buf0.allocate(len * complex_elem_size);
+                buf1.allocate(len * complex_elem_size);
+            }
+        }
+    }
+
+    void run(uchar * src, int src_step, uchar * dst, int dst_step)
+    {
+#if defined USE_IPP_DFT
+        if (useIpp)
+        {
+            int ipp_norm_flag = !isScaled ? 8 : inv ? 2 : 1;
+            if (!isRowTransform)
+            {
+                if (mode == FwdComplexToComplex || mode == InvComplexToComplex)
+                {
+                    if (ippi_DFT_C_32F(src, src_step, dst, dst_step, width, height, inv, ipp_norm_flag))
                     {
                         CV_IMPL_ADD(CV_IMPL_IPP);
                         return;
                     }
                     setIppErrorStatus();
                 }
-                if (src.channels() == 1 && (inv || !(flags & DFT_COMPLEX_OUTPUT)))
+                else if (mode == FwdRealToCCS || mode == InvCCSToReal)
                 {
-                    if (ippi_DFT_R_32F(src, dst, inv, ipp_norm_flag))
+                    if (ippi_DFT_R_32F(src, src_step, dst, dst_step, width, height, inv, ipp_norm_flag))
                     {
                         CV_IMPL_ADD(CV_IMPL_IPP);
                         return;
@@ -2540,20 +2812,20 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
             }
             else
             {
-                if (src.channels() == 2 && !(inv && (flags & DFT_REAL_OUTPUT)))
+                if (mode == FwdComplexToComplex || mode == InvComplexToComplex)
                 {
                     ippiDFT_C_Func ippiFunc = inv ? (ippiDFT_C_Func)ippiDFTInv_CToC_32fc_C1R : (ippiDFT_C_Func)ippiDFTFwd_CToC_32fc_C1R;
-                    if (Dft_C_IPPLoop(src, dst, IPPDFT_C_Functor(ippiFunc),ipp_norm_flag))
+                    if (Dft_C_IPPLoop(src, src_step, dst, dst_step, width, height, IPPDFT_C_Functor(ippiFunc),ipp_norm_flag))
                     {
                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
                         return;
                     }
                     setIppErrorStatus();
                 }
-                if (src.channels() == 1 && (inv || !(flags & DFT_COMPLEX_OUTPUT)))
+                else if (mode == FwdRealToCCS || mode == InvCCSToReal)
                 {
                     ippiDFT_R_Func ippiFunc = inv ? (ippiDFT_R_Func)ippiDFTInv_PackToR_32f_C1R : (ippiDFT_R_Func)ippiDFTFwd_RToPack_32f_C1R;
-                    if (Dft_R_IPPLoop(src, dst, IPPDFT_R_Functor(ippiFunc),ipp_norm_flag))
+                    if (Dft_R_IPPLoop(src, src_step, dst, dst_step, width, height, IPPDFT_R_Functor(ippiFunc),ipp_norm_flag))
                     {
                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
                         return;
@@ -2561,57 +2833,269 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
                     setIppErrorStatus();
                 }
             }
+            return;
         }
-    }
 #endif
 
-    if( !real_transform )
-        elem_size = complex_elem_size;
-
-    if( src.cols == 1 && nonzero_rows > 0 )
-        CV_Error( CV_StsNotImplemented,
-        "This mode (using nonzero_rows with a single-column matrix) breaks the function's logic, so it is prohibited.\n"
-        "For fast convolution/correlation use 2-column matrix or single-row matrix instead" );
-
-    // determine, which transform to do first - row-wise
-    // (stage 0) or column-wise (stage 1) transform
-    if( !(flags & DFT_ROWS) && src.rows > 1 &&
-        ((src.cols == 1 && (!src.isContinuous() || !dst.isContinuous())) ||
-         (src.cols > 1 && inv && real_transform)) )
-        stage = 1;
-
-    for(;;)
-    {
-        double scale = 1;
-        uchar* wave = 0;
-        int* itab = 0;
-        uchar* ptr;
-        int i, len, count, sz = 0;
-        int use_buf = 0, odd_real = 0;
-        DFTFunc dft_func;
-
-        if( stage == 0 ) // row-wise transform
+        for(uint stageIndex = 0; stageIndex < stages.size(); ++stageIndex)
         {
-            len = !inv ? src.cols : dst.cols;
-            count = src.rows;
-            if( len == 1 && !(flags & DFT_ROWS) )
+            int stage_src_channels = src_channels;
+            int stage_dst_channels = dst_channels;
+
+            if (stageIndex == 1)
             {
-                len = !inv ? src.rows : dst.rows;
-                count = 1;
+                src = dst;
+                src_step = dst_step;
+                stage_src_channels = stage_dst_channels;
             }
-            odd_real = real_transform && (len & 1);
+
+            int stage = stages[stageIndex];
+            bool isLastStage = (stageIndex + 1 == stages.size());
+            bool isComplex = stage_src_channels != stage_dst_channels;
+
+            if( stage == 0 )
+                rowDft(src, src_step, dst, dst_step, isComplex, isLastStage);
+            else
+                colDft(src, src_step, dst, dst_step, stage_src_channels, stage_dst_channels, isLastStage);
+        }
+    }
+
+    void free() // releases the per-stage 1D contexts set up by init()
+    {
+        if (useIpp) // IPP path: init() returned before creating any stage context
+            return;
+        hal::dftFree(contextA); // context used by the row-wise stage
+        hal::dftFree(contextB); // context used by the column-wise stage
+    }
+
+protected:
+
+    void rowDft(uchar* src_data, int src_step, uchar* dst_data, int dst_step, bool isComplex, bool isLastStage)
+    {
+        int len, count;
+        if (width == 1 && !isRowTransform )
+        {
+            len = height;
+            count = width;
         }
         else
         {
-            len = dst.rows;
-            count = !inv ? src0.cols : dst.cols;
-            sz = 2*len*complex_elem_size;
+            len = width;
+            count = height;
+        }
+        int dptr_offset = 0;
+        int dst_full_len = len*elem_size;
+
+        if( needBufferA )
+        {
+            if (mode == FwdRealToCCS && (len & 1) && len > 1)
+                dptr_offset = elem_size;
         }
 
-        void *spec = 0;
-#ifdef USE_IPP_DFT
-        if( CV_IPP_CHECK_COND && (len*count >= 64) ) // use IPP DFT if available
+        if( !inv && isComplex )
+            dst_full_len += (len & 1) ? elem_size : complex_elem_size;
+
+        int nz = nonzero_rows;
+        if( nz <= 0 || nz > count )
+            nz = count;
+
+        int i;
+        for( i = 0; i < nz; i++ )
         {
+            const uchar* sptr = src_data + src_step * i;
+            uchar* dptr0 = dst_data + dst_step * i;
+            uchar* dptr = dptr0;
+
+            if( needBufferA )
+                dptr = tmp_bufA;
+
+            hal::dftRun(contextA, sptr, dptr);
+
+            if( needBufferA )
+                memcpy( dptr0, dptr + dptr_offset, dst_full_len );
+        }
+
+        for( ; i < count; i++ )
+        {
+            uchar* dptr0 = dst_data + dst_step * i;
+            memset( dptr0, 0, dst_full_len );
+        }
+        if(isLastStage &&  mode == FwdRealToComplex)
+            complementComplexOutput(depth, dst_data, dst_step, len, nz, 1);
+    }
+
+    void colDft(uchar* src_data, int src_step, uchar* dst_data, int dst_step, int stage_src_channels, int stage_dst_channels, bool isLastStage)
+    {
+        int len = height; // each column is transformed as a vector of `height` elements
+        int count = width;
+        int a = 0, b = count; // bounds of the generic column loop at the end of the function
+        uchar *dbuf0, *dbuf1;
+        const uchar* sptr0 = src_data;
+        uchar* dptr0 = dst_data;
+
+        dbuf0 = buf0, dbuf1 = buf1; // default: transform each gathered column in place
+
+        if( needBufferB ) // the column context asked for separate output buffers
+        {
+            dbuf1 = tmp_bufB;
+            dbuf0 = buf1;
+        }
+
+        if( real_transform ) // first (and, for an even count, last) column is handled apart from the loop
+        {
+            int even;
+            a = 1;
+            even = (count & 1) == 0;
+            b = (count+1)/2;
+            if( !inv )
+            {
+                memset( buf0, 0, len*complex_elem_size );
+                CopyColumn( sptr0, src_step, buf0, complex_elem_size, len, elem_size );
+                sptr0 += stage_dst_channels*elem_size;
+                if( even )
+                {
+                    memset( buf1, 0, len*complex_elem_size );
+                    CopyColumn( sptr0 + (count-2)*elem_size, src_step,
+                                buf1, complex_elem_size, len, elem_size );
+                }
+            }
+            else if( stage_src_channels == 1 )
+            {
+                CopyColumn( sptr0, src_step, buf0, elem_size, len, elem_size );
+                ExpandCCS( buf0, len, elem_size );
+                if( even )
+                {
+                    CopyColumn( sptr0 + (count-1)*elem_size, src_step,
+                                buf1, elem_size, len, elem_size );
+                    ExpandCCS( buf1, len, elem_size );
+                }
+                sptr0 += elem_size;
+            }
+            else
+            {
+                CopyColumn( sptr0, src_step, buf0, complex_elem_size, len, complex_elem_size );
+                if( even )
+                {
+                    CopyColumn( sptr0 + b*complex_elem_size, src_step,
+                                   buf1, complex_elem_size, len, complex_elem_size );
+                }
+                sptr0 += complex_elem_size;
+            }
+
+            if( even )
+                hal::dftRun(contextB, buf1, dbuf1);
+            hal::dftRun(contextB, buf0, dbuf0);
+
+            if( stage_dst_channels == 1 )
+            {
+                if( !inv )
+                {
+                    // copy half of the output vector to the first/last column.
+                    // before doing that, defragment the vector
+                    memcpy( dbuf0 + elem_size, dbuf0, elem_size );
+                    CopyColumn( dbuf0 + elem_size, elem_size, dptr0,
+                                   dst_step, len, elem_size );
+                    if( even )
+                    {
+                        memcpy( dbuf1 + elem_size, dbuf1, elem_size );
+                        CopyColumn( dbuf1 + elem_size, elem_size,
+                                       dptr0 + (count-1)*elem_size,
+                                       dst_step, len, elem_size );
+                    }
+                    dptr0 += elem_size;
+                }
+                else
+                {
+                    // copy the real part of the complex vector to the first/last column
+                    CopyColumn( dbuf0, complex_elem_size, dptr0, dst_step, len, elem_size );
+                    if( even )
+                        CopyColumn( dbuf1, complex_elem_size, dptr0 + (count-1)*elem_size,
+                                       dst_step, len, elem_size );
+                    dptr0 += elem_size;
+                }
+            }
+            else
+            {
+                assert( !inv ); // a real transform with 2-channel output is only expected in the forward direction
+                CopyColumn( dbuf0, complex_elem_size, dptr0,
+                               dst_step, len, complex_elem_size );
+                if( even )
+                    CopyColumn( dbuf1, complex_elem_size,
+                                   dptr0 + b*complex_elem_size,
+                                   dst_step, len, complex_elem_size );
+                dptr0 += complex_elem_size;
+            }
+        }
+
+        for(int i = a; i < b; i += 2 ) // remaining columns, two complex columns per iteration when a pair is left
+        {
+            if( i+1 < b )
+            {
+                CopyFrom2Columns( sptr0, src_step, buf0, buf1, len, complex_elem_size );
+                hal::dftRun(contextB, buf1, dbuf1);
+            }
+            else
+                CopyColumn( sptr0, src_step, buf0, complex_elem_size, len, complex_elem_size );
+
+            hal::dftRun(contextB, buf0, dbuf0);
+
+            if( i+1 < b )
+                CopyTo2Columns( dbuf0, dbuf1, dptr0, dst_step, len, complex_elem_size );
+            else
+                CopyColumn( dbuf0, complex_elem_size, dptr0, dst_step, len, complex_elem_size );
+            sptr0 += 2*complex_elem_size;
+            dptr0 += 2*complex_elem_size;
+        }
+        if(isLastStage && mode == FwdRealToComplex) // post-processing is applied once, after the final stage only
+            complementComplexOutput(depth, dst_data, dst_step, count, len, 2);
+    }
+};
+
+class OcvDftBasicImpl
+{
+public:
+    OcvDftOptions opt;
+    int _factors[34];
+    AutoBuffer<uchar> wave_buf;
+    AutoBuffer<int> itab_buf;
+#ifdef USE_IPP_DFT
+    AutoBuffer<uchar> ippbuf;
+    AutoBuffer<uchar> ippworkbuf;
+#endif
+
+public:
+    OcvDftBasicImpl()
+    {
+        opt.factors = _factors; // the options refer to this object's own factor array
+    }
+    OcvDftBasicImpl & operator=(const OcvDftBasicImpl & other) // copies the options only; the buffers are not copied
+    {
+        this->opt = other.opt; // NOTE(review): opt.factors now refers to other._factors, not ours - confirm this is intended
+        return *this;
+    }
+    void init(int len, int count, int depth, int flags, bool *needBuffer)
+    {
+        int prev_len = opt.n;
+
+        int stage = (flags & CV_HAL_DFT_STAGE_COLS) != 0 ? 1 : 0;
+        int complex_elem_size = depth == CV_32F ? sizeof(Complex<float>) : sizeof(Complex<double>);
+        opt.isInverse = (flags & CV_HAL_DFT_INVERSE) != 0;
+        bool real_transform = (flags & CV_HAL_DFT_REAL_OUTPUT) != 0;
+        opt.isComplex = (stage == 0) && (flags & CV_HAL_DFT_COMPLEX_OUTPUT) != 0;
+        bool needAnotherStage = (flags & CV_HAL_DFT_TWO_STAGE) != 0;
+
+        opt.scale = 1;
+        opt.tab_size = len;
+        opt.n = len;
+
+        opt.useIpp = false;
+    #ifdef USE_IPP_DFT
+        opt.ipp_spec = 0;
+        opt.ipp_work = 0;
+
+        if( CV_IPP_CHECK_COND && (opt.n*count >= 64) ) // use IPP DFT if available
+        {
+            int ipp_norm_flag = (flags & CV_HAL_DFT_SCALE) == 0 ? 8 : opt.isInverse ? 2 : 1;
             int specsize=0, initsize=0, worksize=0;
             IppDFTGetSizeFunc getSizeFunc = 0;
             IppDFTInitFunc initFunc = 0;
@@ -2642,260 +3126,266 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
                     initFunc = (IppDFTInitFunc)ippsDFTInit_C_64fc;
                 }
             }
-            if( getSizeFunc(len, ipp_norm_flag, ippAlgHintNone, &specsize, &initsize, &worksize) >= 0 )
+            if( getSizeFunc(opt.n, ipp_norm_flag, ippAlgHintNone, &specsize, &initsize, &worksize) >= 0 )
             {
                 ippbuf.allocate(specsize + initsize + 64);
-                spec = alignPtr(&ippbuf[0], 32);
-                uchar* initbuf = alignPtr((uchar*)spec + specsize, 32);
-                if( initFunc(len, ipp_norm_flag, ippAlgHintNone, spec, initbuf) < 0 )
-                    spec = 0;
-                sz += worksize;
+                opt.ipp_spec = alignPtr(&ippbuf[0], 32);
+                ippworkbuf.allocate(worksize + 32);
+                opt.ipp_work = alignPtr(&ippworkbuf[0], 32);
+                uchar* initbuf = alignPtr((uchar*)opt.ipp_spec + specsize, 32);
+                if( initFunc(opt.n, ipp_norm_flag, ippAlgHintNone, opt.ipp_spec, initbuf) >= 0 )
+                    opt.useIpp = true;
             }
             else
                 setIppErrorStatus();
         }
-        else
-#endif
+    #endif
+
+        if (!opt.useIpp)
         {
-            if( len != prev_len )
-                nf = DFTFactorize( len, factors );
-
-            inplace_transform = factors[0] == factors[nf-1];
-            sz += len*(complex_elem_size + sizeof(int));
-            i = nf > 1 && (factors[0] & 1) == 0;
-            if( (factors[i] & 1) != 0 && factors[i] > 5 )
-                sz += (factors[i]+1)*complex_elem_size;
-
-            if( (stage == 0 && ((src.data == dst.data && !inplace_transform) || odd_real)) ||
-                (stage == 1 && !inplace_transform) )
+            if (len != prev_len)
             {
-                use_buf = 1;
-                sz += len*complex_elem_size;
+                opt.nf = DFTFactorize( opt.n, opt.factors );
+            }
+            bool inplace_transform = opt.factors[0] == opt.factors[opt.nf-1];
+            if (len != prev_len || (!inplace_transform && opt.isInverse && real_transform))
+            {
+                wave_buf.allocate(opt.n*complex_elem_size);
+                opt.wave = wave_buf;
+                itab_buf.allocate(opt.n);
+                opt.itab = itab_buf;
+                DFTInit( opt.n, opt.nf, opt.factors, opt.itab, complex_elem_size,
+                         opt.wave, stage == 0 && opt.isInverse && real_transform );
             }
-        }
-
-        ptr = (uchar*)buf;
-        buf.allocate( sz + 32 );
-        if( ptr != (uchar*)buf )
-            prev_len = 0; // because we release the buffer,
-                          // force recalculation of
-                          // twiddle factors and permutation table
-        ptr = (uchar*)buf;
-        if( !spec )
-        {
-            wave = ptr;
-            ptr += len*complex_elem_size;
-            itab = (int*)ptr;
-            ptr = (uchar*)cvAlignPtr( ptr + len*sizeof(int), 16 );
-
-            if( len != prev_len || (!inplace_transform && inv && real_transform))
-                DFTInit( len, nf, factors, itab, complex_elem_size,
-                            wave, stage == 0 && inv && real_transform );
             // otherwise reuse the tables calculated on the previous stage
-        }
-
-        if( stage == 0 )
-        {
-            uchar* tmp_buf = 0;
-            int dptr_offset = 0;
-            int dst_full_len = len*elem_size;
-            int _flags = (int)inv + (src.channels() != dst.channels() ?
-                         DFT_COMPLEX_INPUT_OR_OUTPUT : 0);
-            if( use_buf )
+            if (needBuffer)
             {
-                tmp_buf = ptr;
-                ptr += len*complex_elem_size;
-                if( odd_real && !inv && len > 1 &&
-                    !(_flags & DFT_COMPLEX_INPUT_OR_OUTPUT))
-                    dptr_offset = elem_size;
+                if( (stage == 0 && ((*needBuffer && !inplace_transform) || (real_transform && (len & 1)))) ||
+                    (stage == 1 && !inplace_transform) )
+                {
+                    *needBuffer = true;
+                }
             }
-
-            if( !inv && (_flags & DFT_COMPLEX_INPUT_OR_OUTPUT) )
-                dst_full_len += (len & 1) ? elem_size : complex_elem_size;
-
-            dft_func = dft_tbl[(!real_transform ? 0 : !inv ? 1 : 2) + (depth == CV_64F)*3];
-
-            if( count > 1 && !(flags & DFT_ROWS) && (!inv || !real_transform) )
-                stage = 1;
-            else if( flags & CV_DXT_SCALE )
-                scale = 1./(len * (flags & DFT_ROWS ? 1 : count));
-
-            if( nonzero_rows <= 0 || nonzero_rows > count )
-                nonzero_rows = count;
-
-            for( i = 0; i < nonzero_rows; i++ )
-            {
-                const uchar* sptr = src.ptr(i);
-                uchar* dptr0 = dst.ptr(i);
-                uchar* dptr = dptr0;
-
-                if( tmp_buf )
-                    dptr = tmp_buf;
-
-                dft_func( sptr, dptr, len, nf, factors, itab, wave, len, spec, ptr, _flags, scale );
-                if( dptr != dptr0 )
-                    memcpy( dptr0, dptr + dptr_offset, dst_full_len );
-            }
-
-            for( ; i < count; i++ )
-            {
-                uchar* dptr0 = dst.ptr(i);
-                memset( dptr0, 0, dst_full_len );
-            }
-
-            if( stage != 1 )
-            {
-                if( !inv && real_transform && dst.channels() == 2 )
-                    complementComplexOutput(dst, nonzero_rows, 1);
-                break;
-            }
-            src = dst;
         }
         else
         {
-            int a = 0, b = count;
-            uchar *buf0, *buf1, *dbuf0, *dbuf1;
-            const uchar* sptr0 = src.ptr();
-            uchar* dptr0 = dst.ptr();
-            buf0 = ptr;
-            ptr += len*complex_elem_size;
-            buf1 = ptr;
-            ptr += len*complex_elem_size;
-            dbuf0 = buf0, dbuf1 = buf1;
-
-            if( use_buf )
+            if (needBuffer)
             {
-                dbuf1 = ptr;
-                dbuf0 = buf1;
-                ptr += len*complex_elem_size;
+                *needBuffer = false;
             }
+        }
 
-            dft_func = dft_tbl[(depth == CV_64F)*3];
-
-            if( real_transform && inv && src.cols > 1 )
-                stage = 0;
-            else if( flags & CV_DXT_SCALE )
-                scale = 1./(len * count);
-
-            if( real_transform )
+        {
+            static DFTFunc dft_tbl[6] =
             {
-                int even;
-                a = 1;
-                even = (count & 1) == 0;
-                b = (count+1)/2;
-                if( !inv )
+                (DFTFunc)DFT_32f,
+                (DFTFunc)RealDFT_32f,
+                (DFTFunc)CCSIDFT_32f,
+                (DFTFunc)DFT_64f,
+                (DFTFunc)RealDFT_64f,
+                (DFTFunc)CCSIDFT_64f
+            };
+            int idx = 0;
+            if (stage == 0)
+            {
+                if (real_transform)
                 {
-                    memset( buf0, 0, len*complex_elem_size );
-                    CopyColumn( sptr0, src.step, buf0, complex_elem_size, len, elem_size );
-                    sptr0 += dst.channels()*elem_size;
-                    if( even )
-                    {
-                        memset( buf1, 0, len*complex_elem_size );
-                        CopyColumn( sptr0 + (count-2)*elem_size, src.step,
-                                    buf1, complex_elem_size, len, elem_size );
-                    }
-                }
-                else if( src.channels() == 1 )
-                {
-                    CopyColumn( sptr0, src.step, buf0, elem_size, len, elem_size );
-                    ExpandCCS( buf0, len, elem_size );
-                    if( even )
-                    {
-                        CopyColumn( sptr0 + (count-1)*elem_size, src.step,
-                                    buf1, elem_size, len, elem_size );
-                        ExpandCCS( buf1, len, elem_size );
-                    }
-                    sptr0 += elem_size;
-                }
-                else
-                {
-                    CopyColumn( sptr0, src.step, buf0, complex_elem_size, len, complex_elem_size );
-                    if( even )
-                    {
-                        CopyColumn( sptr0 + b*complex_elem_size, src.step,
-                                       buf1, complex_elem_size, len, complex_elem_size );
-                    }
-                    sptr0 += complex_elem_size;
-                }
-
-                if( even )
-                    dft_func( buf1, dbuf1, len, nf, factors, itab,
-                              wave, len, spec, ptr, inv, scale );
-                dft_func( buf0, dbuf0, len, nf, factors, itab,
-                          wave, len, spec, ptr, inv, scale );
-
-                if( dst.channels() == 1 )
-                {
-                    if( !inv )
-                    {
-                        // copy the half of output vector to the first/last column.
-                        // before doing that, defgragment the vector
-                        memcpy( dbuf0 + elem_size, dbuf0, elem_size );
-                        CopyColumn( dbuf0 + elem_size, elem_size, dptr0,
-                                       dst.step, len, elem_size );
-                        if( even )
-                        {
-                            memcpy( dbuf1 + elem_size, dbuf1, elem_size );
-                            CopyColumn( dbuf1 + elem_size, elem_size,
-                                           dptr0 + (count-1)*elem_size,
-                                           dst.step, len, elem_size );
-                        }
-                        dptr0 += elem_size;
-                    }
+                    if (!opt.isInverse)
+                        idx = 1;
                     else
-                    {
-                        // copy the real part of the complex vector to the first/last column
-                        CopyColumn( dbuf0, complex_elem_size, dptr0, dst.step, len, elem_size );
-                        if( even )
-                            CopyColumn( dbuf1, complex_elem_size, dptr0 + (count-1)*elem_size,
-                                           dst.step, len, elem_size );
-                        dptr0 += elem_size;
-                    }
-                }
-                else
-                {
-                    assert( !inv );
-                    CopyColumn( dbuf0, complex_elem_size, dptr0,
-                                   dst.step, len, complex_elem_size );
-                    if( even )
-                        CopyColumn( dbuf1, complex_elem_size,
-                                       dptr0 + b*complex_elem_size,
-                                       dst.step, len, complex_elem_size );
-                    dptr0 += complex_elem_size;
+                        idx = 2;
                 }
             }
+            if (depth == CV_64F)
+                idx += 3;
 
-            for( i = a; i < b; i += 2 )
-            {
-                if( i+1 < b )
-                {
-                    CopyFrom2Columns( sptr0, src.step, buf0, buf1, len, complex_elem_size );
-                    dft_func( buf1, dbuf1, len, nf, factors, itab,
-                              wave, len, spec, ptr, inv, scale );
-                }
-                else
-                    CopyColumn( sptr0, src.step, buf0, complex_elem_size, len, complex_elem_size );
+            opt.dft_func = dft_tbl[idx];
+        }
 
-                dft_func( buf0, dbuf0, len, nf, factors, itab,
-                          wave, len, spec, ptr, inv, scale );
-
-                if( i+1 < b )
-                    CopyTo2Columns( dbuf0, dbuf1, dptr0, dst.step, len, complex_elem_size );
-                else
-                    CopyColumn( dbuf0, complex_elem_size, dptr0, dst.step, len, complex_elem_size );
-                sptr0 += 2*complex_elem_size;
-                dptr0 += 2*complex_elem_size;
-            }
-
-            if( stage != 0 )
-            {
-                if( !inv && real_transform && dst.channels() == 2 && len > 1 )
-                    complementComplexOutput(dst, len, 2);
-                break;
-            }
-            src = dst;
+        if(!needAnotherStage && (flags & CV_HAL_DFT_SCALE) != 0)
+        {
+            int rowCount = count;
+            if (stage == 0 && (flags & CV_HAL_DFT_ROWS) != 0)
+                rowCount = 1;
+            opt.scale = 1./(len * rowCount);
         }
     }
+
+    void run(const void * src, void * dst) // runs one 1D transform with the settings prepared by init()
+    {
+        opt.dft_func(opt, src, dst); // kernel chosen from dft_tbl in init()
+    }
+
+    void free() {} // intentionally empty: all working buffers are AutoBuffer members of this object
+};
+
+namespace hal {
+
+//================== 1D ======================
+
+void dftInit(DftContext & context, int len, int count, int depth, int flags, bool *needBuffer)
+{
+    int res = cv_hal_dftInit(&context.impl, len, count, depth, flags, needBuffer); // a custom HAL gets the first chance
+    if (res == CV_HAL_ERROR_OK)
+    {
+        context.useReplacement = true; // dftRun/dftFree will route this context to the custom HAL
+        return;
+    }
+
+    context.useReplacement = false;
+    OcvDftBasicImpl * c = (OcvDftBasicImpl*)context.impl;
+    if (!c) // create the built-in implementation on first use; an existing one is re-initialized below
+    {
+        c = new OcvDftBasicImpl();
+        context.impl = (void*)c;
+    }
+    c->init(len, count, depth, flags, needBuffer);
+}
+
+void dftRun(const DftContext & context, const void * src, void * dst)
+{
+    if (context.useReplacement) // context was initialized by the custom HAL, so it must also run there
+    {
+        int res = cv_hal_dftRun(context.impl, src, dst);
+        if (res != CV_HAL_ERROR_OK)
+        {
+            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftRun");
+        }
+        return; // no fallback: the built-in implementation was never initialized for this context
+    }
+    OcvDftBasicImpl * c = (OcvDftBasicImpl*)context.impl; // built-in path; dftInit must have been called first
+    c->run(src, dst);
+}
+
+void dftFree(DftContext & context)
+{
+    if (context.useReplacement) // context belongs to the custom HAL, which releases it
+    {
+        int res = cv_hal_dftFree(context.impl);
+        if (res != CV_HAL_ERROR_OK)
+        {
+            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftFree");
+        }
+        return;
+    }
+
+    OcvDftBasicImpl * c = (OcvDftBasicImpl*)context.impl;
+    if (c) // nothing to do when the context holds no built-in implementation
+    {
+        c->free();
+        delete c;
+        context.impl = 0; // makes a repeated dftFree on the same context a no-op
+    }
+}
+
+
+//================== 2D ======================
+
+void dftInit2D(DftContext & c,
+               int _width, int _height, int _depth, int _src_channels, int _dst_channels,
+               int flags,
+               int _nonzero_rows)
+{
+    int res = cv_hal_dftInit2D(&c.impl, _width, _height, _depth, _src_channels, _dst_channels, flags, _nonzero_rows); // custom HAL first
+    if (res == CV_HAL_ERROR_OK)
+    {
+        c.useReplacement = true; // dftRun2D/dftFree2D will route this context to the custom HAL
+        return;
+    }
+    c.useReplacement = false;
+
+    if( _width == 1 && _nonzero_rows > 0 ) // checked on the built-in path only, after the custom HAL declined
+        CV_Error( CV_StsNotImplemented,
+        "This mode (using nonzero_rows with a single-column matrix) breaks the function's logic, so it is prohibited.\n"
+        "For fast convolution/correlation use 2-column matrix or single-row matrix instead" );
+
+    OcvDftImpl * d = new OcvDftImpl();
+    d->init(_width, _height, _depth, _src_channels, _dst_channels, flags, _nonzero_rows);
+    c.impl = (void*)d; // stored only after init() returns
+}
+
+void dftRun2D(const DftContext & c,
+              const void * src, int src_step, void * dst, int dst_step)
+{
+    if (c.useReplacement) // context was initialized by the custom HAL, so it must also run there
+    {
+        int res = cv_hal_dftRun2D(c.impl, (uchar*)src, src_step, (uchar*)dst, dst_step);
+        if (res != CV_HAL_ERROR_OK)
+        {
+            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftRun2D");
+        }
+        return; // no fallback: the built-in implementation was never initialized for this context
+    }
+    OcvDftImpl * d = (OcvDftImpl*)c.impl; // built-in path; dftInit2D must have been called first
+    d->run((uchar*)src, src_step, (uchar*)dst, dst_step);
+}
+
+void dftFree2D(DftContext & c) // releases a context set up by dftInit2D
+{
+    if (c.useReplacement) // context belongs to the custom HAL, which releases it
+    {
+        int res = cv_hal_dftFree2D(c.impl);
+        if (res != CV_HAL_ERROR_OK)
+            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftFree2D");
+        return;
+    }
+    // Like dftFree(): tolerate a context with no built-in implementation (e.g. one that was already freed).
+    OcvDftImpl * d = (OcvDftImpl*)c.impl;
+    if (d)
+        d->free();
+    delete d;
+    c.impl = 0; // makes a repeated dftFree2D on the same context a no-op
+}
+
+} // cv::hal::
+
+} // cv::
+
+
+void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
+{
+#ifdef HAVE_CLAMDFFT
+    CV_OCL_RUN(ocl::haveAmdFft() && ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU &&
+            _dst.isUMat() && _src0.dims() <= 2 && nonzero_rows == 0,
+               ocl_dft_amdfft(_src0, _dst, flags))
+#endif
+
+#ifdef HAVE_OPENCL
+    CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2,
+               ocl_dft(_src0, _dst, flags, nonzero_rows))
+#endif
+
+    Mat src0 = _src0.getMat(), src = src0;
+    bool inv = (flags & DFT_INVERSE) != 0;
+    int type = src.type();
+    int depth = src.depth();
+
+    CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 );
+
+    if( !inv && src.channels() == 1 && (flags & DFT_COMPLEX_OUTPUT) ) // forward, 1-channel input, complex output requested
+        _dst.create( src.size(), CV_MAKETYPE(depth, 2) );
+    else if( inv && src.channels() == 2 && (flags & DFT_REAL_OUTPUT) ) // inverse, 2-channel input, real output requested
+        _dst.create( src.size(), depth );
+    else
+        _dst.create( src.size(), type ); // otherwise the output has the same type as the input
+
+    Mat dst = _dst.getMat();
+
+    int f = 0; // translate the public DFT_* flags and the matrix properties into CV_HAL_DFT_* flags
+    if (src.isContinuous() && dst.isContinuous())
+        f |= CV_HAL_DFT_IS_CONTINUOUS;
+    if (inv)
+        f |= CV_HAL_DFT_INVERSE;
+    if (flags & DFT_ROWS)
+        f |= CV_HAL_DFT_ROWS;
+    if (flags & DFT_SCALE)
+        f |= CV_HAL_DFT_SCALE;
+    if (src.data == dst.data) // source and destination share the same buffer
+        f |= CV_HAL_DFT_IS_INPLACE;
+    hal::DftContext c;
+    hal::dftInit2D(c, src.cols, src.rows, depth, src.channels(), dst.channels(), f, nonzero_rows);
+    hal::dftRun2D(c, src.data, (int)src.step, dst.data, (int)dst.step); // NOTE(review): if this throws, dftFree2D below is skipped - confirm DftContext cleans up itself
+    hal::dftFree2D(c);
 }
 
 
@@ -3117,11 +3607,12 @@ namespace cv
    http://www.ece.utexas.edu/~bevans/courses/ee381k/lectures/09_DCT/lecture9/:
 */
 template<typename T> static void
-DCT( const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
-     int n, int nf, int* factors, const int* itab, const Complex<T>* dft_wave,
-     const Complex<T>* dct_wave, const void* spec, Complex<T>* buf )
+DCT( const OcvDftOptions & c, const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
+     const Complex<T>* dct_wave )
 {
     static const T sin_45 = (T)0.70710678118654752440084436210485;
+
+    int n = c.n;
     int j, n2 = n >> 1;
 
     src_step /= sizeof(src[0]);
@@ -3140,8 +3631,7 @@ DCT( const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
         dft_src[n-j-1] = src[src_step];
     }
 
-    RealDFT( dft_src, dft_dst, n, nf, factors,
-             itab, dft_wave, n, spec, buf, 0, 1.0 );
+    RealDFT(c, dft_src, dft_dst);
     src = dft_dst;
 
     dst[0] = (T)(src[0]*dct_wave->re*sin_45);
@@ -3160,11 +3650,11 @@ DCT( const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
 
 
 template<typename T> static void
-IDCT( const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
-      int n, int nf, int* factors, const int* itab, const Complex<T>* dft_wave,
-      const Complex<T>* dct_wave, const void* spec, Complex<T>* buf )
+IDCT( const OcvDftOptions & c, const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
+      const Complex<T>* dct_wave)
 {
     static const T sin_45 = (T)0.70710678118654752440084436210485;
+    int n = c.n;
     int j, n2 = n >> 1;
 
     src_step /= sizeof(src[0]);
@@ -3189,8 +3679,7 @@ IDCT( const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
     }
 
     dft_src[n-1] = (T)(src[0]*2*dct_wave->re);
-    CCSIDFT( dft_src, dft_dst, n, nf, factors, itab,
-             dft_wave, n, spec, buf, 0, 1.0 );
+    CCSIDFT(c, dft_src, dft_dst);
 
     for( j = 0; j < n2; j++, dst += dst_step*2 )
     {
@@ -3279,41 +3768,31 @@ DCTInit( int n, int elem_size, void* _wave, int inv )
 }
 
 
-typedef void (*DCTFunc)(const void* src, int src_step, void* dft_src,
-                        void* dft_dst, void* dst, int dst_step, int n,
-                        int nf, int* factors, const int* itab, const void* dft_wave,
-                        const void* dct_wave, const void* spec, void* buf );
+typedef void (*DCTFunc)(const OcvDftOptions & c, const void* src, int src_step, void* dft_src,
+                        void* dft_dst, void* dst, int dst_step, const void* dct_wave); // type-erased kernel signature; the four typed wrappers below are cast to it
 
-static void DCT_32f(const float* src, int src_step, float* dft_src, float* dft_dst,
-                    float* dst, int dst_step, int n, int nf, int* factors, const int* itab,
-                    const Complexf* dft_wave, const Complexf* dct_wave, const void* spec, Complexf* buf )
+static void DCT_32f(const OcvDftOptions & c, const float* src, int src_step, float* dft_src, float* dft_dst,
+                    float* dst, int dst_step, const Complexf* dct_wave)
 {
-    DCT(src, src_step, dft_src, dft_dst, dst, dst_step,
-        n, nf, factors, itab, dft_wave, dct_wave, spec, buf);
+    DCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave); // forwards to the DCT template with float arguments
 }
 
-static void IDCT_32f(const float* src, int src_step, float* dft_src, float* dft_dst,
-                    float* dst, int dst_step, int n, int nf, int* factors, const int* itab,
-                    const Complexf* dft_wave, const Complexf* dct_wave, const void* spec, Complexf* buf )
+static void IDCT_32f(const OcvDftOptions & c, const float* src, int src_step, float* dft_src, float* dft_dst,
+                    float* dst, int dst_step, const Complexf* dct_wave)
 {
-    IDCT(src, src_step, dft_src, dft_dst, dst, dst_step,
-         n, nf, factors, itab, dft_wave, dct_wave, spec, buf);
+    IDCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave); // forwards to the IDCT template with float arguments
 }
 
-static void DCT_64f(const double* src, int src_step, double* dft_src, double* dft_dst,
-                    double* dst, int dst_step, int n, int nf, int* factors, const int* itab,
-                    const Complexd* dft_wave, const Complexd* dct_wave, const void* spec, Complexd* buf )
+static void DCT_64f(const OcvDftOptions & c, const double* src, int src_step, double* dft_src, double* dft_dst,
+                    double* dst, int dst_step, const Complexd* dct_wave)
 {
-    DCT(src, src_step, dft_src, dft_dst, dst, dst_step,
-        n, nf, factors, itab, dft_wave, dct_wave, spec, buf);
+    DCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave); // forwards to the DCT template with double arguments
 }
 
-static void IDCT_64f(const double* src, int src_step, double* dft_src, double* dft_dst,
-                     double* dst, int dst_step, int n, int nf, int* factors, const int* itab,
-                     const Complexd* dft_wave, const Complexd* dct_wave, const void* spec, Complexd* buf )
+static void IDCT_64f(const OcvDftOptions & c, const double* src, int src_step, double* dft_src, double* dft_dst,
+                     double* dst, int dst_step, const Complexd* dct_wave)
 {
-    IDCT(src, src_step, dft_src, dft_dst, dst, dst_step,
-         n, nf, factors, itab, dft_wave, dct_wave, spec, buf);
+    IDCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave); // forwards to the IDCT template with double arguments
 }
 
 }
@@ -3336,8 +3815,8 @@ typedef IppStatus (CV_STDCALL * ippiDCTGetBufSize)(const void*, int*);
 class DctIPPLoop_Invoker : public ParallelLoopBody
 {
 public:
-    DctIPPLoop_Invoker(const Mat& _src, Mat& _dst, bool _inv, bool *_ok) :
-        ParallelLoopBody(), src(&_src), dst(&_dst), inv(_inv), ok(_ok)
+    DctIPPLoop_Invoker(const uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width, bool _inv, bool *_ok) :
+        ParallelLoopBody(), src(_src), src_step(_src_step), dst(_dst), dst_step(_dst_step), width(_width), inv(_inv), ok(_ok)
     {
         *ok = true;
     }
@@ -3348,7 +3827,7 @@ public:
             return;
 
 #if IPP_VERSION_X100 >= 900
-        IppiSize srcRoiSize = {src->cols, 1};
+        IppiSize srcRoiSize = {width, 1};
 
         int specSize    = 0;
         int initSize    = 0;
@@ -3405,7 +3884,7 @@ public:
 
         for(int i = range.start; i < range.end; ++i)
         {
-            if(ippDctFun(src->ptr<float>(i), (int)src->step,dst->ptr<float>(i), (int)dst->step, pDCTSpec, pBuffer) < 0)
+            if(ippDctFun((float*)(src + src_step * i), src_step, (float*)(dst + dst_step * i), dst_step, pDCTSpec, pBuffer) < 0)
             {
                 *ok = false;
                 IPP_RETURN
@@ -3419,7 +3898,7 @@ public:
         uchar* pBuffer = 0;
         int bufSize=0;
 
-        IppiSize srcRoiSize = {src->cols, 1};
+        IppiSize srcRoiSize = {width, 1};
 
         CV_SUPPRESS_DEPRECATED_START
 
@@ -3435,7 +3914,7 @@ public:
 
             for( int i = range.start; i < range.end; ++i)
             {
-                if(ippDctFun(src->ptr<float>(i), (int)src->step,dst->ptr<float>(i), (int)dst->step, pDCTSpec, (Ipp8u*)pBuffer) < 0)
+                if(ippDctFun((float*)(src + src_step * i), src_step, (float*)(dst + dst_step * i), dst_step, pDCTSpec, (Ipp8u*)pBuffer) < 0)
                 {
                     *ok = false;
                     break;
@@ -3456,27 +3935,30 @@ public:
     }
 
 private:
-    const Mat* src;
-    Mat* dst;
+    const uchar * src;
+    int src_step;
+    uchar * dst;
+    int dst_step;
+    int width;
     bool inv;
     bool *ok;
 };
 
-static bool DctIPPLoop(const Mat& src, Mat& dst, bool inv)
+static bool DctIPPLoop(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv) // runs DctIPPLoop_Invoker over rows [0, height) via parallel_for_ and returns its status flag
 {
     bool ok;
-    parallel_for_(Range(0, src.rows), DctIPPLoop_Invoker(src, dst, inv, &ok), src.rows/(double)(1<<4) );
+    parallel_for_(Range(0, height), DctIPPLoop_Invoker(src, src_step, dst, dst_step, width, inv, &ok), height/(double)(1<<4) ); // the invoker's constructor sets ok = true; a failing IPP call clears it
     return ok;
 }
 
-static bool ippi_DCT_32f(const Mat& src, Mat& dst, bool inv, bool row)
+static bool ippi_DCT_32f(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, bool row)
 {
     if(row)
-        return DctIPPLoop(src, dst, inv);
+        return DctIPPLoop(src, src_step, dst, dst_step, width, height, inv);
     else
     {
 #if IPP_VERSION_X100 >= 900
-        IppiSize srcRoiSize = {src.cols, src.rows};
+        IppiSize srcRoiSize = {width, height};
 
         int specSize    = 0;
         int initSize    = 0;
@@ -3524,7 +4006,7 @@ static bool ippi_DCT_32f(const Mat& src, Mat& dst, bool inv, bool row)
             return false;
         }
 
-        if(ippDctFun(src.ptr<float>(), (int)src.step,dst.ptr<float>(), (int)dst.step, pDCTSpec, pBuffer) < 0)
+        if(ippDctFun((float*)src, src_step, (float*)dst, dst_step, pDCTSpec, pBuffer) < 0)
         {
             IPP_RELEASE
             return false;
@@ -3540,7 +4022,7 @@ static bool ippi_DCT_32f(const Mat& src, Mat& dst, bool inv, bool row)
         uchar* pBuffer = 0;
         int bufSize=0;
 
-        IppiSize srcRoiSize = {src.cols, src.rows};
+        IppiSize srcRoiSize = {width, height};
 
         CV_SUPPRESS_DEPRECATED_START
 
@@ -3556,7 +4038,7 @@ static bool ippi_DCT_32f(const Mat& src, Mat& dst, bool inv, bool row)
             buf.allocate( bufSize );
             pBuffer = (uchar*)buf;
 
-            status = ippDctFun(src.ptr<float>(), (int)src.step, dst.ptr<float>(), (int)dst.step, pDCTSpec, (Ipp8u*)pBuffer);
+            status = ippDctFun((float*)src, src_step, (float*)dst, dst_step, pDCTSpec, (Ipp8u*)pBuffer);
         }
 
         if (pDCTSpec)
@@ -3574,140 +4056,219 @@ static bool ippi_DCT_32f(const Mat& src, Mat& dst, bool inv, bool row)
 }
 #endif
 
+namespace cv {
+
+class OcvDctImpl // built-in DCT backend; the hal::dct* wrappers below create it when the replacement HAL does not accept the request
+{
+public:
+    OcvDftOptions opt; // options passed as the first argument to the DCT kernel on every call
+
+    int _factors[34]; // storage behind opt.factors, filled by DFTFactorize() in run()
+    AutoBuffer<uint> wave_buf; // storage behind opt.wave
+    AutoBuffer<int> itab_buf; // storage behind opt.itab
+
+    DCTFunc dct_func; // kernel chosen in init() from depth and direction
+    bool isRowTransform; // CV_HAL_DFT_ROWS was set
+    bool isInverse; // CV_HAL_DFT_INVERSE was set
+    bool isContinuous; // CV_HAL_DFT_IS_CONTINUOUS was set
+    int start_stage; // first pass run() executes: 0 = along rows, 1 = along columns
+    int end_stage; // last pass run() executes (inclusive)
+    int width;
+    int height;
+    int depth;
+
+    void init(int _width, int _height, int _depth, int flags) // stores the geometry, decodes the CV_HAL_DFT_* flags, selects the kernel and the stage range
+    {
+        width = _width;
+        height = _height;
+        depth = _depth;
+        isInverse = (flags & CV_HAL_DFT_INVERSE) != 0;
+        isRowTransform = (flags & CV_HAL_DFT_ROWS) != 0;
+        isContinuous = (flags & CV_HAL_DFT_IS_CONTINUOUS) != 0;
+        static DCTFunc dct_tbl[4] = // order: forward/inverse for 32F, then forward/inverse for 64F
+        {
+            (DCTFunc)DCT_32f,
+            (DCTFunc)IDCT_32f,
+            (DCTFunc)DCT_64f,
+            (DCTFunc)IDCT_64f
+        };
+        dct_func = dct_tbl[(int)isInverse + (depth == CV_64F)*2]; // index = isInverse + 2 * (depth == CV_64F)
+        opt.nf = 0;
+        opt.isComplex = false;
+        opt.isInverse = false;
+        opt.noPermute = false;
+        opt.scale = 1.;
+        opt.factors = _factors; // NOTE(review): the pre-refactoring dct() passed scratch space (buf) to the kernels; nothing equivalent is assigned here - confirm it is not needed
+
+        if (isRowTransform || height == 1 || (width == 1 && isContinuous)) // one 1D pass is enough
+        {
+            start_stage = end_stage = 0;
+        }
+        else
+        {
+            start_stage = (width == 1); // a single non-continuous column skips the row pass
+            end_stage = 1;
+        }
+    }
+    void run(uchar * src, int src_step, uchar * dst, int dst_step) // executes the stages selected in init(); a second stage reads the output of the first
+    {
+        CV_IPP_RUN(IPP_VERSION_X100 >= 700 && depth == CV_32F, ippi_DCT_32f(src, src_step, dst, dst_step, width, height, isInverse, isRowTransform)) // NOTE(review): CV_IPP_RUN is defined elsewhere; presumably returns early when the IPP call succeeds - confirm
+
+        AutoBuffer<uchar> dct_wave;
+        AutoBuffer<uchar> src_buf, dst_buf;
+        uchar *src_dft_buf = 0, *dst_dft_buf = 0;
+        int prev_len = 0; // transform length the tables were last built for
+        int elem_size = (depth == CV_32F) ? sizeof(float) : sizeof(double);
+        int complex_elem_size = elem_size*2;
+
+        for(int stage = start_stage ; stage <= end_stage; stage++ )
+        {
+            const uchar* sptr = src;
+            uchar* dptr = dst;
+            size_t sstep0, sstep1, dstep0, dstep1;
+            int len, count;
+
+            if( stage == 0 ) // row pass: `count` signals of `len` adjacent samples
+            {
+                len = width;
+                count = height;
+                if( len == 1 && !isRowTransform ) // single column: handle it as one signal of `height` samples (init() only selects stage 0 here when height == 1 or the data is continuous)
+                {
+                    len = height;
+                    count = 1;
+                }
+                sstep0 = src_step; // *step0: byte offset between consecutive signals
+                dstep0 = dst_step;
+                sstep1 = dstep1 = elem_size; // *step1: byte offset between samples of one signal
+            }
+            else // column pass: one signal per column, samples are one row stride apart
+            {
+                len = height;
+                count = width;
+                sstep1 = src_step;
+                dstep1 = dst_step;
+                sstep0 = dstep0 = elem_size;
+            }
+
+            opt.n = len;
+            opt.tab_size = len;
+
+            if( len != prev_len ) // rebuild the factorization and tables only when the length changes
+            {
+                if( len > 1 && (len & 1) )
+                    CV_Error( CV_StsNotImplemented, "Odd-size DCT\'s are not implemented" );
+
+                opt.nf = DFTFactorize( len, opt.factors );
+                bool inplace_transform = opt.factors[0] == opt.factors[opt.nf-1];
+
+                wave_buf.allocate(len*complex_elem_size); // NOTE(review): wave_buf holds uint elements while this size looks like a byte count - confirm the over-allocation is intended
+                opt.wave = wave_buf;
+                itab_buf.allocate(len);
+                opt.itab = itab_buf;
+                DFTInit( len, opt.nf, opt.factors, opt.itab, complex_elem_size, opt.wave, isInverse );
+
+                dct_wave.allocate((len/2 + 1)*complex_elem_size);
+                src_buf.allocate(len*elem_size);
+                src_dft_buf = src_buf;
+                if(!inplace_transform)
+                {
+                    dst_buf.allocate(len*elem_size);
+                    dst_dft_buf = dst_buf;
+                }
+                else
+                {
+                    dst_dft_buf = src_buf; // in-place case: both scratch pointers share src_buf
+                }
+                DCTInit( len, complex_elem_size, dct_wave, isInverse);
+                prev_len = len;
+            }
+            // otherwise reuse the tables calculated on the previous stage
+            for(int i = 0; i < count; i++ )
+            {
+                dct_func( opt, sptr + i*sstep0, (int)sstep1, src_dft_buf, dst_dft_buf,
+                          dptr + i*dstep0, (int)dstep1, dct_wave);
+            }
+            src = dst; // the next stage transforms the result of this one in place
+            src_step = dst_step;
+        }
+
+    }
+    void free() {} // nothing to release by hand; all buffers are members
+};
+
+namespace hal {
+
+void dctInit(DftContext & c, int width, int height, int depth, int flags)
+{
+    int res = cv_hal_dctInit(&c.impl, width, height, depth, flags);
+    if (res == CV_HAL_ERROR_OK)
+    {
+        c.useReplacement = true;
+        return;
+    }
+    c.useReplacement = false;
+    OcvDctImpl * impl = new OcvDctImpl();
+    impl->init(width, height, depth, flags);
+    c.impl = impl;
+}
+
+void dctRun(const DftContext & c, const void * src, int src_step, void * dst, int dst_step)
+{
+    if (c.useReplacement)
+    {
+        int res = cv_hal_dctRun(c.impl, src, src_step, dst, dst_step);
+        if (res != CV_HAL_ERROR_OK)
+        {
+            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dctRun");
+        }
+        return;
+    }
+    OcvDctImpl * impl = (OcvDctImpl*)c.impl;
+    impl->run((uchar*)src, src_step, (uchar*)dst, dst_step);
+}
+
+void dctFree(DftContext & c)
+{
+    if (c.useReplacement)
+    {
+        int res = cv_hal_dctFree(c.impl);
+        if (res != CV_HAL_ERROR_OK)
+        {
+            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dctFree");
+        }
+        return;
+    }
+    OcvDctImpl * impl = (OcvDctImpl*)c.impl;
+    impl->free();
+    delete impl;
+    c.impl = 0;
+}
+
+} // cv::hal::
+
+} // cv::
+
 void cv::dct( InputArray _src0, OutputArray _dst, int flags )
 {
-    static DCTFunc dct_tbl[4] =
-    {
-        (DCTFunc)DCT_32f,
-        (DCTFunc)IDCT_32f,
-        (DCTFunc)DCT_64f,
-        (DCTFunc)IDCT_64f
-    };
-
-    bool inv = (flags & DCT_INVERSE) != 0;
     Mat src0 = _src0.getMat(), src = src0;
     int type = src.type(), depth = src.depth();
-    void *spec = 0;
-
-    double scale = 1.;
-    int prev_len = 0, nf = 0, stage, end_stage;
-    uchar *src_dft_buf = 0, *dst_dft_buf = 0;
-    uchar *dft_wave = 0, *dct_wave = 0;
-    int* itab = 0;
-    uchar* ptr = 0;
-    int elem_size = (int)src.elemSize(), complex_elem_size = elem_size*2;
-    int factors[34], inplace_transform;
-    int i, len, count;
-    AutoBuffer<uchar> buf;
 
     CV_Assert( type == CV_32FC1 || type == CV_64FC1 );
     _dst.create( src.rows, src.cols, type );
     Mat dst = _dst.getMat();
 
-    CV_IPP_RUN(IPP_VERSION_X100 >= 700 && src.type() == CV_32F, ippi_DCT_32f(src, dst, inv, ((flags & DCT_ROWS) != 0)))
+    int f = 0; // CV_HAL_DFT_* bits built from the public DCT_* flags and the buffer properties
+    if ((flags & DCT_ROWS) != 0) // dct() takes DCT_* flags, as the removed code and the DCT_INVERSE test below do
+        f |= CV_HAL_DFT_ROWS;
+    if ((flags & DCT_INVERSE) != 0)
+        f |= CV_HAL_DFT_INVERSE;
+    if (src.isContinuous() && dst.isContinuous())
+        f |= CV_HAL_DFT_IS_CONTINUOUS;
 
-    DCTFunc dct_func = dct_tbl[(int)inv + (depth == CV_64F)*2];
-
-    if( (flags & DCT_ROWS) || src.rows == 1 ||
-        (src.cols == 1 && (src.isContinuous() && dst.isContinuous())))
-    {
-        stage = end_stage = 0;
-    }
-    else
-    {
-        stage = src.cols == 1;
-        end_stage = 1;
-    }
-
-    for( ; stage <= end_stage; stage++ )
-    {
-        const uchar* sptr = src.ptr();
-        uchar* dptr = dst.ptr();
-        size_t sstep0, sstep1, dstep0, dstep1;
-
-        if( stage == 0 )
-        {
-            len = src.cols;
-            count = src.rows;
-            if( len == 1 && !(flags & DCT_ROWS) )
-            {
-                len = src.rows;
-                count = 1;
-            }
-            sstep0 = src.step;
-            dstep0 = dst.step;
-            sstep1 = dstep1 = elem_size;
-        }
-        else
-        {
-            len = dst.rows;
-            count = dst.cols;
-            sstep1 = src.step;
-            dstep1 = dst.step;
-            sstep0 = dstep0 = elem_size;
-        }
-
-        if( len != prev_len )
-        {
-            int sz;
-
-            if( len > 1 && (len & 1) )
-                CV_Error( CV_StsNotImplemented, "Odd-size DCT\'s are not implemented" );
-
-            sz = len*elem_size;
-            sz += (len/2 + 1)*complex_elem_size;
-
-            spec = 0;
-            inplace_transform = 1;
-            {
-                sz += len*(complex_elem_size + sizeof(int)) + complex_elem_size;
-
-                nf = DFTFactorize( len, factors );
-                inplace_transform = factors[0] == factors[nf-1];
-
-                i = nf > 1 && (factors[0] & 1) == 0;
-                if( (factors[i] & 1) != 0 && factors[i] > 5 )
-                    sz += (factors[i]+1)*complex_elem_size;
-
-                if( !inplace_transform )
-                    sz += len*elem_size;
-            }
-
-            buf.allocate( sz + 32 );
-            ptr = (uchar*)buf;
-
-            if( !spec )
-            {
-                dft_wave = ptr;
-                ptr += len*complex_elem_size;
-                itab = (int*)ptr;
-                ptr = (uchar*)cvAlignPtr( ptr + len*sizeof(int), 16 );
-                DFTInit( len, nf, factors, itab, complex_elem_size, dft_wave, inv );
-            }
-
-            dct_wave = ptr;
-            ptr += (len/2 + 1)*complex_elem_size;
-            src_dft_buf = dst_dft_buf = ptr;
-            ptr += len*elem_size;
-            if( !inplace_transform )
-            {
-                dst_dft_buf = ptr;
-                ptr += len*elem_size;
-            }
-            DCTInit( len, complex_elem_size, dct_wave, inv );
-            if( !inv )
-                scale += scale;
-            prev_len = len;
-        }
-        // otherwise reuse the tables calculated on the previous stage
-        for( i = 0; i < count; i++ )
-        {
-            dct_func( sptr + i*sstep0, (int)sstep1, src_dft_buf, dst_dft_buf,
-                      dptr + i*dstep0, (int)dstep1, len, nf, factors,
-                      itab, dft_wave, dct_wave, spec, ptr );
-        }
-        src = dst;
-    }
+    hal::DftContext c;
+    hal::dctInit(c, src.cols, src.rows, depth, f);
+    hal::dctRun(c, (void*)src.data, (int)src.step, (void*)dst.data, (int)dst.step);
+    hal::dctFree(c);
 }
 
 
diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp
index 69345ca4a..d4d43332c 100644
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@@ -384,6 +384,31 @@ inline int hal_ni_merge64s(const int64 **src_data, int64 *dst_data, int len, int
 #  pragma warning( pop )
 #endif
 
+inline int hal_ni_dftInit(void**, int, int, int, int, bool*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dftRun(const void*, const void*, void*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dftFree(void*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+#define cv_hal_dftInit hal_ni_dftInit
+#define cv_hal_dftRun hal_ni_dftRun
+#define cv_hal_dftFree hal_ni_dftFree
+
+inline int hal_ni_dftInit2D(void **, int, int, int, int, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dftRun2D(const void *, const void *, int, void *, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dftFree2D(void *) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+#define cv_hal_dftInit2D hal_ni_dftInit2D
+#define cv_hal_dftRun2D hal_ni_dftRun2D
+#define cv_hal_dftFree2D hal_ni_dftFree2D
+
+
+inline int hal_ni_dctInit(void **, int, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dctRun(const void *, const void *, int, void *, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dctFree(void *) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+#define cv_hal_dctInit hal_ni_dctInit
+#define cv_hal_dctRun hal_ni_dctRun
+#define cv_hal_dctFree hal_ni_dctFree
+
 #include "custom_hal.hpp"
 
 #endif
diff --git a/modules/core/test/test_dxt.cpp b/modules/core/test/test_dxt.cpp
index ad75e52dd..45994e0e1 100644
--- a/modules/core/test/test_dxt.cpp
+++ b/modules/core/test/test_dxt.cpp
@@ -887,3 +887,79 @@ TEST(Core_DFT, complex_output2)
         }
     }
 }
+
+class Core_DXTReverseTest : public cvtest::BaseTest // round-trip test: a forward transform followed by the inverse must reproduce the input
+{
+public:
+    enum Mode
+    {
+        ModeDFT,
+        ModeDCT
+    };
+    Core_DXTReverseTest(Mode m) : mode(m) {}
+private:
+    Mode mode; // selects cv::dft or cv::dct in run()
+protected:
+    void run(int)
+    {
+        for (int i = 0; i < 3; ++i) // i selects the channel configuration set up in the switch below
+        {
+            if (mode == ModeDCT && i != 0) // DCT is exercised with the single-channel configuration only
+                continue;
+            int flags = 0;
+            int flags_inv = DFT_INVERSE | DFT_SCALE;
+            int cn_in = 0;
+            int cn_out = 0;
+            switch (i)
+            {
+                case 0: cn_in = 1; cn_out = 1; break; // 1-channel input, 1-channel spectrum
+                case 1: cn_in = 1; cn_out = 2; flags |= DFT_COMPLEX_OUTPUT; flags_inv |= DFT_REAL_OUTPUT; break; // 1-channel input, 2-channel spectrum, 1-channel result
+                case 2: cn_in = 2; cn_out = 2; break; // 2-channel input and spectrum
+            };
+            for (int j = 0; j < 100; ++j)
+            {
+                RNG& rng = ts->get_rng();
+                int type = rng.uniform(0, 2) ? CV_64F : CV_32F; // despite the name this is a depth; the channel count is added when `one` is created
+                int m = rng.uniform(1, 10);
+                int n = rng.uniform(1, 10);
+                if (mode == ModeDCT) // make both dimensions even for the DCT case
+                {
+                    m *= 2;
+                    n *= 2;
+                }
+                Mat one(m, n, CV_MAKETYPE(type, cn_in));
+                cvtest::randUni(rng, one, Scalar::all(-1.), Scalar::all(1.));
+                Mat out; // forward transform of `one`
+                Mat two; // inverse transform of `out`, compared against `one`
+                if (mode == ModeDFT)
+                {
+                    cv::dft(one, out, flags);
+                    cv::dft(out, two, flags_inv);
+                }
+                else if (mode == ModeDCT)
+                {
+                    cv::dct(one, out, flags);
+                    cv::dct(out, two, flags_inv);
+                }
+                if (out.channels() != cn_out || two.channels() != cn_in || cvtest::norm(one, two, NORM_INF) > 1e-5) // wrong channel count, or the round trip differs from the input by more than 1e-5
+                {
+                    cout << "Test #" << j + 1 << " - "
+                        << "elements: " << m << " x " << n << ", "
+                        << "channels: "
+                        << one.channels() << " (" << cn_in << ")" << " -> "
+                        << out.channels() << " (" << cn_out << ")" << " -> "
+                        << two.channels() << " (" << cn_in << ")"
+                        << endl;
+                    cout << "signal:\n" << one << endl << endl;
+                    cout << "spectrum:\n" << out << endl << endl;
+                    cout << "inverse:\n" << two << endl << endl;
+                    ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
+                    break; // leaves the inner loop only; the remaining channel configurations still run
+                }
+            }
+        }
+    }
+};
+
+TEST(Core_DFT, reverse) { Core_DXTReverseTest test(Core_DXTReverseTest::ModeDFT); test.safe_run(); }
+TEST(Core_DCT, reverse) { Core_DXTReverseTest test(Core_DXTReverseTest::ModeDCT); test.safe_run(); }
diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp
index 59b07032b..64742eaa2 100644
--- a/modules/imgproc/src/templmatch.cpp
+++ b/modules/imgproc/src/templmatch.cpp
@@ -632,6 +632,8 @@ static bool ipp_sqrDistance(const Mat& src, const Mat& tpl, Mat& dst)
 
 #endif
 
+#include "opencv2/core/hal/hal.hpp"
+
 void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
                 Size corrsize, int ctype,
                 Point anchor, double delta, int borderType )
@@ -698,6 +700,9 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
 
     buf.resize(bufSize);
 
+    hal::DftContext c;
+    hal::dftInit2D(c, dftsize.width, dftsize.height, dftTempl.depth(), 1, 1, CV_HAL_DFT_IS_INPLACE, templ.rows);
+
     // compute DFT of each template plane
     for( k = 0; k < tcn; k++ )
     {
@@ -721,9 +726,11 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
             Mat part(dst, Range(0, templ.rows), Range(templ.cols, dst.cols));
             part = Scalar::all(0);
         }
-        dft(dst, dst, 0, templ.rows);
+        hal::dftRun2D(c, dst.data, (int)dst.step, dst.data, (int)dst.step);
     }
 
+    hal::dftFree2D(c);
+
     int tileCountX = (corr.cols + blocksize.width - 1)/blocksize.width;
     int tileCountY = (corr.rows + blocksize.height - 1)/blocksize.height;
     int tileCount = tileCountX * tileCountY;
@@ -740,6 +747,16 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
     }
     borderType |= BORDER_ISOLATED;
 
+    bool useHalDft = tileCount > 1;
+    hal::DftContext cF, cR;
+    if (useHalDft)
+    {
+        int f = CV_HAL_DFT_IS_INPLACE;
+        int f_inv = f | CV_HAL_DFT_INVERSE | CV_HAL_DFT_SCALE;
+        hal::dftInit2D(cF, dftsize.width, dftsize.height, maxDepth, 1, 1, f, blocksize.height + templ.rows - 1);
+        hal::dftInit2D(cR, dftsize.width, dftsize.height, maxDepth, 1, 1, f_inv, blocksize.height);
+    }
+
     // calculate correlation by blocks
     for( i = 0; i < tileCount; i++ )
     {
@@ -777,11 +794,19 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
                 copyMakeBorder(dst1, dst, y1-y0, dst.rows-dst1.rows-(y1-y0),
                                x1-x0, dst.cols-dst1.cols-(x1-x0), borderType);
 
-            dft( dftImg, dftImg, 0, dsz.height );
+            if (useHalDft && bsz.height == blocksize.height)
+                hal::dftRun2D(cF, dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
+            else
+                dft( dftImg, dftImg, 0, dsz.height );
+
             Mat dftTempl1(dftTempl, Rect(0, tcn > 1 ? k*dftsize.height : 0,
                                          dftsize.width, dftsize.height));
             mulSpectrums(dftImg, dftTempl1, dftImg, 0, true);
-            dft( dftImg, dftImg, DFT_INVERSE + DFT_SCALE, bsz.height );
+
+            if (useHalDft && bsz.height == blocksize.height)
+                hal::dftRun2D(cR, dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
+            else
+                dft( dftImg, dftImg, DFT_INVERSE + DFT_SCALE, bsz.height );
 
             src = dftImg(Rect(0, 0, bsz.width, bsz.height));
 
@@ -813,6 +838,11 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
             }
         }
     }
+    if (useHalDft)
+    {
+        hal::dftFree2D(cF);
+        hal::dftFree2D(cR);
+    }
 }
 
 static void matchTemplateMask( InputArray _img, InputArray _templ, OutputArray _result, int method, InputArray _mask )

From 15783cf668b989617f56c40aaf1ceed9e118086d Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@itseez.com>
Date: Thu, 4 Feb 2016 19:29:23 +0300
Subject: [PATCH 3/7] Always use hal::dft in crossCorr function

---
 modules/imgproc/src/templmatch.cpp | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp
index 64742eaa2..6353f14ff 100644
--- a/modules/imgproc/src/templmatch.cpp
+++ b/modules/imgproc/src/templmatch.cpp
@@ -747,15 +747,11 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
     }
     borderType |= BORDER_ISOLATED;
 
-    bool useHalDft = tileCount > 1;
     hal::DftContext cF, cR;
-    if (useHalDft)
-    {
-        int f = CV_HAL_DFT_IS_INPLACE;
-        int f_inv = f | CV_HAL_DFT_INVERSE | CV_HAL_DFT_SCALE;
-        hal::dftInit2D(cF, dftsize.width, dftsize.height, maxDepth, 1, 1, f, blocksize.height + templ.rows - 1);
-        hal::dftInit2D(cR, dftsize.width, dftsize.height, maxDepth, 1, 1, f_inv, blocksize.height);
-    }
+    int f = CV_HAL_DFT_IS_INPLACE;
+    int f_inv = f | CV_HAL_DFT_INVERSE | CV_HAL_DFT_SCALE;
+    hal::dftInit2D(cF, dftsize.width, dftsize.height, maxDepth, 1, 1, f, blocksize.height + templ.rows - 1);
+    hal::dftInit2D(cR, dftsize.width, dftsize.height, maxDepth, 1, 1, f_inv, blocksize.height);
 
     // calculate correlation by blocks
     for( i = 0; i < tileCount; i++ )
@@ -794,7 +790,7 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
                 copyMakeBorder(dst1, dst, y1-y0, dst.rows-dst1.rows-(y1-y0),
                                x1-x0, dst.cols-dst1.cols-(x1-x0), borderType);
 
-            if (useHalDft && bsz.height == blocksize.height)
+            if (bsz.height == blocksize.height)
                 hal::dftRun2D(cF, dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
             else
                 dft( dftImg, dftImg, 0, dsz.height );
@@ -803,7 +799,7 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
                                          dftsize.width, dftsize.height));
             mulSpectrums(dftImg, dftTempl1, dftImg, 0, true);
 
-            if (useHalDft && bsz.height == blocksize.height)
+            if (bsz.height == blocksize.height)
                 hal::dftRun2D(cR, dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
             else
                 dft( dftImg, dftImg, DFT_INVERSE + DFT_SCALE, bsz.height );
@@ -838,11 +834,8 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
             }
         }
     }
-    if (useHalDft)
-    {
-        hal::dftFree2D(cF);
-        hal::dftFree2D(cR);
-    }
+    hal::dftFree2D(cF);
+    hal::dftFree2D(cR);
 }
 
 static void matchTemplateMask( InputArray _img, InputArray _templ, OutputArray _result, int method, InputArray _mask )

From f40d701427d0c9aed8dc78588401bdd36f35ea91 Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@itseez.com>
Date: Fri, 5 Feb 2016 11:40:40 +0300
Subject: [PATCH 4/7] DFT: renamed HAL functions

---
 modules/core/include/opencv2/core/hal/hal.hpp | 14 ++---
 modules/core/src/dxt.cpp                      | 62 +++++++++----------
 modules/core/src/hal_replacement.hpp          | 28 ++++-----
 modules/imgproc/src/templmatch.cpp            |  6 +-
 4 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/hal.hpp b/modules/core/include/opencv2/core/hal/hal.hpp
index 52a5f99b3..6b9f93dbf 100644
--- a/modules/core/include/opencv2/core/hal/hal.hpp
+++ b/modules/core/include/opencv2/core/hal/hal.hpp
@@ -195,16 +195,16 @@ struct DftContext
 };
 
 CV_EXPORTS void dftInit2D(DftContext & c, int _width, int _height, int _depth, int _src_channels, int _dst_channels, int flags, int _nonzero_rows = 0);
-CV_EXPORTS void dftRun2D(const DftContext & c, const void * src, int src_step, void * dst, int dst_step);
+CV_EXPORTS void dft2D(const DftContext & c, const void * src, int src_step, void * dst, int dst_step);
 CV_EXPORTS void dftFree2D(DftContext & c);
 
-CV_EXPORTS void dftInit(DftContext & c, int len, int count, int depth, int flags, bool * useBuffer = 0);
-CV_EXPORTS void dftRun(const DftContext & c, const void * src, void * dst);
-CV_EXPORTS void dftFree(DftContext & c);
+CV_EXPORTS void dftInit1D(DftContext & c, int len, int count, int depth, int flags, bool * useBuffer = 0);
+CV_EXPORTS void dft1D(const DftContext & c, const void * src, void * dst);
+CV_EXPORTS void dftFree1D(DftContext & c);
 
-CV_EXPORTS void dctInit(DftContext & c, int width, int height, int depth, int flags);
-CV_EXPORTS void dctRun(const DftContext & c, const void * src, int src_step, void * dst, int dst_step);
-CV_EXPORTS void dctFree(DftContext & c);
+CV_EXPORTS void dctInit2D(DftContext & c, int width, int height, int depth, int flags);
+CV_EXPORTS void dct2D(const DftContext & c, const void * src, int src_step, void * dst, int dst_step);
+CV_EXPORTS void dctFree2D(DftContext & c);
 
 //! @} core_hal
 
diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp
index 1265091bc..1ea549675 100644
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -2763,7 +2763,7 @@ public:
                     count = height;
                 }
                 needBufferA = isInplace;
-                hal::dftInit(contextA, len, count, depth, f, &needBufferA);
+                hal::dftInit1D(contextA, len, count, depth, f, &needBufferA);
                 if (needBufferA)
                     tmp_bufA.allocate(len * complex_elem_size);
             }
@@ -2773,7 +2773,7 @@ public:
                 count = width;
                 f |= CV_HAL_DFT_STAGE_COLS;
                 needBufferB = isInplace;
-                hal::dftInit(contextB, len, count, depth, f, &needBufferB);
+                hal::dftInit1D(contextB, len, count, depth, f, &needBufferB);
                 if (needBufferB)
                     tmp_bufB.allocate(len * complex_elem_size);
 
@@ -2864,8 +2864,8 @@ public:
     {
         if (useIpp)
             return;
-        hal::dftFree(contextA);
-        hal::dftFree(contextB);
+        hal::dftFree1D(contextA);
+        hal::dftFree1D(contextB);
     }
 
 protected:
@@ -2909,7 +2909,7 @@ protected:
             if( needBufferA )
                 dptr = tmp_bufA;
 
-            hal::dftRun(contextA, sptr, dptr);
+            hal::dft1D(contextA, sptr, dptr);
 
             if( needBufferA )
                 memcpy( dptr0, dptr + dptr_offset, dst_full_len );
@@ -2983,8 +2983,8 @@ protected:
             }
 
             if( even )
-                hal::dftRun(contextB, buf1, dbuf1);
-            hal::dftRun(contextB, buf0, dbuf0);
+                hal::dft1D(contextB, buf1, dbuf1);
+            hal::dft1D(contextB, buf0, dbuf0);
 
             if( stage_dst_channels == 1 )
             {
@@ -3032,12 +3032,12 @@ protected:
             if( i+1 < b )
             {
                 CopyFrom2Columns( sptr0, src_step, buf0, buf1, len, complex_elem_size );
-                hal::dftRun(contextB, buf1, dbuf1);
+                hal::dft1D(contextB, buf1, dbuf1);
             }
             else
                 CopyColumn( sptr0, src_step, buf0, complex_elem_size, len, complex_elem_size );
 
-            hal::dftRun(contextB, buf0, dbuf0);
+            hal::dft1D(contextB, buf0, dbuf0);
 
             if( i+1 < b )
                 CopyTo2Columns( dbuf0, dbuf1, dptr0, dst_step, len, complex_elem_size );
@@ -3223,9 +3223,9 @@ namespace hal {
 
 //================== 1D ======================
 
-void dftInit(DftContext & context, int len, int count, int depth, int flags, bool *needBuffer)
+void dftInit1D(DftContext & context, int len, int count, int depth, int flags, bool *needBuffer)
 {
-    int res = cv_hal_dftInit(&context.impl, len, count, depth, flags, needBuffer);
+    int res = cv_hal_dftInit1D(&context.impl, len, count, depth, flags, needBuffer);
     if (res == CV_HAL_ERROR_OK)
     {
         context.useReplacement = true;
@@ -3242,11 +3242,11 @@ void dftInit(DftContext & context, int len, int count, int depth, int flags, boo
     c->init(len, count, depth, flags, needBuffer);
 }
 
-void dftRun(const DftContext & context, const void * src, void * dst)
+void dft1D(const DftContext & context, const void * src, void * dst)
 {
     if (context.useReplacement)
     {
-        int res = cv_hal_dftRun(context.impl, src, dst);
+        int res = cv_hal_dft1D(context.impl, src, dst);
         if (res != CV_HAL_ERROR_OK)
         {
             CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftRun");
@@ -3257,11 +3257,11 @@ void dftRun(const DftContext & context, const void * src, void * dst)
     c->run(src, dst);
 }
 
-void dftFree(DftContext & context)
+void dftFree1D(DftContext & context)
 {
     if (context.useReplacement)
     {
-        int res = cv_hal_dftFree(context.impl);
+        int res = cv_hal_dftFree1D(context.impl);
         if (res != CV_HAL_ERROR_OK)
         {
             CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftFree");
@@ -3282,9 +3282,9 @@ void dftFree(DftContext & context)
 //================== 2D ======================
 
 void dftInit2D(DftContext & c,
-               int _width, int _height, int _depth, int _src_channels, int _dst_channels,
-               int flags,
-               int _nonzero_rows)
+             int _width, int _height, int _depth, int _src_channels, int _dst_channels,
+             int flags,
+             int _nonzero_rows)
 {
     int res = cv_hal_dftInit2D(&c.impl, _width, _height, _depth, _src_channels, _dst_channels, flags, _nonzero_rows);
     if (res == CV_HAL_ERROR_OK)
@@ -3304,12 +3304,12 @@ void dftInit2D(DftContext & c,
     c.impl = (void*)d;
 }
 
-void dftRun2D(const DftContext & c,
-              const void * src, int src_step, void * dst, int dst_step)
+void dft2D(const DftContext & c,
+         const void * src, int src_step, void * dst, int dst_step)
 {
     if (c.useReplacement)
     {
-        int res = cv_hal_dftRun2D(c.impl, (uchar*)src, src_step, (uchar*)dst, dst_step);
+        int res = cv_hal_dft2D(c.impl, (uchar*)src, src_step, (uchar*)dst, dst_step);
         if (res != CV_HAL_ERROR_OK)
         {
             CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftRun2D");
@@ -3384,7 +3384,7 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
         f |= CV_HAL_DFT_IS_INPLACE;
     hal::DftContext c;
     hal::dftInit2D(c, src.cols, src.rows, depth, src.channels(), dst.channels(), f, nonzero_rows);
-    hal::dftRun2D(c, src.data, (int)src.step, dst.data, (int)dst.step);
+    hal::dft2D(c, src.data, (int)src.step, dst.data, (int)dst.step);
     hal::dftFree2D(c);
 }
 
@@ -4198,9 +4198,9 @@ public:
 
 namespace hal {
 
-void dctInit(DftContext & c, int width, int height, int depth, int flags)
+void dctInit2D(DftContext & c, int width, int height, int depth, int flags)
 {
-    int res = cv_hal_dctInit(&c.impl, width, height, depth, flags);
+    int res = cv_hal_dctInit2D(&c.impl, width, height, depth, flags);
     if (res == CV_HAL_ERROR_OK)
     {
         c.useReplacement = true;
@@ -4212,11 +4212,11 @@ void dctInit(DftContext & c, int width, int height, int depth, int flags)
     c.impl = impl;
 }
 
-void dctRun(const DftContext & c, const void * src, int src_step, void * dst, int dst_step)
+void dct2D(const DftContext & c, const void * src, int src_step, void * dst, int dst_step)
 {
     if (c.useReplacement)
     {
-        int res = cv_hal_dctRun(c.impl, src, src_step, dst, dst_step);
+        int res = cv_hal_dct2D(c.impl, src, src_step, dst, dst_step);
         if (res != CV_HAL_ERROR_OK)
         {
             CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dctRun");
@@ -4227,11 +4227,11 @@ void dctRun(const DftContext & c, const void * src, int src_step, void * dst, in
     impl->run((uchar*)src, src_step, (uchar*)dst, dst_step);
 }
 
-void dctFree(DftContext & c)
+void dctFree2D(DftContext & c)
 {
     if (c.useReplacement)
     {
-        int res = cv_hal_dctFree(c.impl);
+        int res = cv_hal_dctFree2D(c.impl);
         if (res != CV_HAL_ERROR_OK)
         {
             CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dctFree");
@@ -4266,9 +4266,9 @@ void cv::dct( InputArray _src0, OutputArray _dst, int flags )
         f |= CV_HAL_DFT_IS_CONTINUOUS;
 
     hal::DftContext c;
-    hal::dctInit(c, src.cols, src.rows, depth, f);
-    hal::dctRun(c, (void*)src.data, (int)src.step, (void*)dst.data, (int)dst.step);
-    hal::dctFree(c);
+    hal::dctInit2D(c, src.cols, src.rows, depth, f);
+    hal::dct2D(c, (void*)src.data, (int)src.step, (void*)dst.data, (int)dst.step);
+    hal::dctFree2D(c);
 }
 
 
diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp
index d4d43332c..bbf32f39d 100644
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@@ -384,30 +384,30 @@ inline int hal_ni_merge64s(const int64 **src_data, int64 *dst_data, int len, int
 #  pragma warning( pop )
 #endif
 
-inline int hal_ni_dftInit(void**, int, int, int, int, bool*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dftRun(const void*, const void*, void*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dftFree(void*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dftInit1D(void**, int, int, int, int, bool*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dft1D(const void*, const void*, void*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dftFree1D(void*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
-#define cv_hal_dftInit hal_ni_dftInit
-#define cv_hal_dftRun hal_ni_dftRun
-#define cv_hal_dftFree hal_ni_dftFree
+#define cv_hal_dftInit1D hal_ni_dftInit1D
+#define cv_hal_dft1D hal_ni_dft1D
+#define cv_hal_dftFree1D hal_ni_dftFree1D
 
 inline int hal_ni_dftInit2D(void **, int, int, int, int, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dftRun2D(const void *, const void *, int, void *, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dft2D(const void *, const void *, int, void *, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_dftFree2D(void *) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
 #define cv_hal_dftInit2D hal_ni_dftInit2D
-#define cv_hal_dftRun2D hal_ni_dftRun2D
+#define cv_hal_dft2D hal_ni_dft2D
 #define cv_hal_dftFree2D hal_ni_dftFree2D
 
 
-inline int hal_ni_dctInit(void **, int, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dctRun(const void *, const void *, int, void *, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dctFree(void *) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dctInit2D(void **, int, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dct2D(const void *, const void *, int, void *, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_dctFree2D(void *) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
-#define cv_hal_dctInit hal_ni_dctInit
-#define cv_hal_dctRun hal_ni_dctRun
-#define cv_hal_dctFree hal_ni_dctFree
+#define cv_hal_dctInit2D hal_ni_dctInit2D
+#define cv_hal_dct2D hal_ni_dct2D
+#define cv_hal_dctFree2D hal_ni_dctFree2D
 
 #include "custom_hal.hpp"
 
diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp
index 6353f14ff..4e8958279 100644
--- a/modules/imgproc/src/templmatch.cpp
+++ b/modules/imgproc/src/templmatch.cpp
@@ -726,7 +726,7 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
             Mat part(dst, Range(0, templ.rows), Range(templ.cols, dst.cols));
             part = Scalar::all(0);
         }
-        hal::dftRun2D(c, dst.data, (int)dst.step, dst.data, (int)dst.step);
+        hal::dft2D(c, dst.data, (int)dst.step, dst.data, (int)dst.step);
     }
 
     hal::dftFree2D(c);
@@ -791,7 +791,7 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
                                x1-x0, dst.cols-dst1.cols-(x1-x0), borderType);
 
             if (bsz.height == blocksize.height)
-                hal::dftRun2D(cF, dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
+                hal::dft2D(cF, dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
             else
                 dft( dftImg, dftImg, 0, dsz.height );
 
@@ -800,7 +800,7 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
             mulSpectrums(dftImg, dftTempl1, dftImg, 0, true);
 
             if (bsz.height == blocksize.height)
-                hal::dftRun2D(cR, dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
+                hal::dft2D(cR, dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
             else
                 dft( dftImg, dftImg, DFT_INVERSE + DFT_SCALE, bsz.height );
 

From 233612efd7925bd022777d297fdf65215f16dcc8 Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@itseez.com>
Date: Fri, 8 Apr 2016 16:03:51 +0300
Subject: [PATCH 5/7] Reworked HAL dft/dct interface, added replacement
 documentation

---
 modules/core/include/opencv2/core/hal/hal.hpp |  32 +-
 .../core/include/opencv2/core/hal/interface.h |  25 +-
 modules/core/src/dxt.cpp                      | 354 ++++++++----------
 modules/core/src/hal_replacement.hpp          | 121 ++++--
 modules/imgproc/src/templmatch.cpp            |  19 +-
 5 files changed, 295 insertions(+), 256 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/hal.hpp b/modules/core/include/opencv2/core/hal/hal.hpp
index 6b9f93dbf..5b01cbe4c 100644
--- a/modules/core/include/opencv2/core/hal/hal.hpp
+++ b/modules/core/include/opencv2/core/hal/hal.hpp
@@ -187,24 +187,28 @@ CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2,
 CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
 CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
 
-struct DftContext
+struct CV_EXPORTS DFT1D
 {
-    void * impl;
-    bool useReplacement;
-    DftContext() : impl(0), useReplacement(false) {}
+    static Ptr<DFT1D> create(int len, int count, int depth, int flags, bool * useBuffer = 0);
+    virtual void apply(const uchar *src, uchar *dst) = 0;
+    virtual ~DFT1D() {}
 };
 
-CV_EXPORTS void dftInit2D(DftContext & c, int _width, int _height, int _depth, int _src_channels, int _dst_channels, int flags, int _nonzero_rows = 0);
-CV_EXPORTS void dft2D(const DftContext & c, const void * src, int src_step, void * dst, int dst_step);
-CV_EXPORTS void dftFree2D(DftContext & c);
+struct CV_EXPORTS DFT2D
+{
+    static Ptr<DFT2D> create(int width, int height, int depth,
+                             int src_channels, int dst_channels,
+                             int flags, int nonzero_rows = 0);
+    virtual void apply(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) = 0;
+    virtual ~DFT2D() {}
+};
 
-CV_EXPORTS void dftInit1D(DftContext & c, int len, int count, int depth, int flags, bool * useBuffer = 0);
-CV_EXPORTS void dft1D(const DftContext & c, const void * src, void * dst);
-CV_EXPORTS void dftFree1D(DftContext & c);
-
-CV_EXPORTS void dctInit2D(DftContext & c, int width, int height, int depth, int flags);
-CV_EXPORTS void dct2D(const DftContext & c, const void * src, int src_step, void * dst, int dst_step);
-CV_EXPORTS void dctFree2D(DftContext & c);
+struct CV_EXPORTS DCT2D
+{
+    static Ptr<DCT2D> create(int width, int height, int depth, int flags);
+    virtual void apply(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) = 0;
+    virtual ~DCT2D() {}
+};
 
 //! @} core_hal
 
diff --git a/modules/core/include/opencv2/core/hal/interface.h b/modules/core/include/opencv2/core/hal/interface.h
index 0da68f18c..2bb7b19f2 100644
--- a/modules/core/include/opencv2/core/hal/interface.h
+++ b/modules/core/include/opencv2/core/hal/interface.h
@@ -11,21 +11,11 @@
 #define CV_HAL_ERROR_UNKNOWN -1
 //! @}
 
-
-#define CV_HAL_DFT_INVERSE        1
-#define CV_HAL_DFT_SCALE          2
-#define CV_HAL_DFT_ROWS           4
-#define CV_HAL_DFT_COMPLEX_OUTPUT 16
-#define CV_HAL_DFT_REAL_OUTPUT    32
-#define CV_HAL_DFT_TWO_STAGE      64
-#define CV_HAL_DFT_STAGE_COLS    128
-#define CV_HAL_DFT_IS_CONTINUOUS 512
-#define CV_HAL_DFT_IS_INPLACE 1024
-
 #ifdef __cplusplus
 #include <cstddef>
 #else
 #include <stddef.h>
+#include <stdbool.h>
 #endif
 
 //! @name Data types
@@ -155,6 +145,19 @@ typedef signed char schar;
 #define CV_HAL_BORDER_ISOLATED 16
 //! @}
 
+//! @name DFT flags
+//! @{
+#define CV_HAL_DFT_INVERSE        1
+#define CV_HAL_DFT_SCALE          2
+#define CV_HAL_DFT_ROWS           4
+#define CV_HAL_DFT_COMPLEX_OUTPUT 16
+#define CV_HAL_DFT_REAL_OUTPUT    32
+#define CV_HAL_DFT_TWO_STAGE      64
+#define CV_HAL_DFT_STAGE_COLS    128
+#define CV_HAL_DFT_IS_CONTINUOUS 512
+#define CV_HAL_DFT_IS_INPLACE 1024
+//! @}
+
 //! @}
 
 #endif
diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp
index 1ea549675..2cff51d5a 100644
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -1553,7 +1553,7 @@ class Dft_C_IPPLoop_Invoker : public ParallelLoopBody
 {
 public:
 
-    Dft_C_IPPLoop_Invoker(uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width,
+    Dft_C_IPPLoop_Invoker(const uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width,
                           const Dft& _ippidft, int _norm_flag, bool *_ok) :
         ParallelLoopBody(),
         src(_src), src_step(_src_step), dst(_dst), dst_step(_dst_step), width(_width),
@@ -1617,7 +1617,7 @@ public:
     }
 
 private:
-    uchar * src;
+    const uchar * src;
     int src_step;
     uchar * dst;
     int dst_step;
@@ -1634,7 +1634,7 @@ class Dft_R_IPPLoop_Invoker : public ParallelLoopBody
 {
 public:
 
-    Dft_R_IPPLoop_Invoker(uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width,
+    Dft_R_IPPLoop_Invoker(const uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width,
                           const Dft& _ippidft, int _norm_flag, bool *_ok) :
         ParallelLoopBody(),
         src(_src), src_step(_src_step), dst(_dst), dst_step(_dst_step), width(_width),
@@ -1698,7 +1698,7 @@ public:
     }
 
 private:
-    uchar * src;
+    const uchar * src;
     int src_step;
     uchar * dst;
     int dst_step;
@@ -1711,7 +1711,7 @@ private:
 };
 
 template <typename Dft>
-bool Dft_C_IPPLoop(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, const Dft& ippidft, int norm_flag)
+bool Dft_C_IPPLoop(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, const Dft& ippidft, int norm_flag)
 {
     bool ok;
     parallel_for_(Range(0, height), Dft_C_IPPLoop_Invoker<Dft>(src, src_step, dst, dst_step, width, ippidft, norm_flag, &ok), (width * height)/(double)(1<<16) );
@@ -1719,7 +1719,7 @@ bool Dft_C_IPPLoop(uchar * src, int src_step, uchar * dst, int dst_step, int wid
 }
 
 template <typename Dft>
-bool Dft_R_IPPLoop(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, const Dft& ippidft, int norm_flag)
+bool Dft_R_IPPLoop(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, const Dft& ippidft, int norm_flag)
 {
     bool ok;
     parallel_for_(Range(0, height), Dft_R_IPPLoop_Invoker<Dft>(src, src_step, dst, dst_step, width, ippidft, norm_flag, &ok), (width * height)/(double)(1<<16) );
@@ -1750,7 +1750,7 @@ private:
     ippiDFT_R_Func func;
 };
 
-static bool ippi_DFT_C_32F(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, int norm_flag)
+static bool ippi_DFT_C_32F(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, int norm_flag)
 {
     IppStatus status;
     Ipp8u* pBuffer = 0;
@@ -1804,7 +1804,7 @@ static bool ippi_DFT_C_32F(uchar * src, int src_step, uchar * dst, int dst_step,
     return false;
 }
 
-static bool ippi_DFT_R_32F(uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, int norm_flag)
+static bool ippi_DFT_R_32F(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, int norm_flag)
 {
     IppStatus status;
     Ipp8u* pBuffer = 0;
@@ -2611,11 +2611,11 @@ inline DftDims determineDims(int rows, int cols, bool isRowWise, bool isContinuo
     return InvalidDim;
 }
 
-class OcvDftImpl
+class OcvDftImpl : public hal::DFT2D
 {
 protected:
-    hal::DftContext contextA;
-    hal::DftContext contextB;
+    Ptr<hal::DFT1D> contextA;
+    Ptr<hal::DFT1D> contextB;
     bool needBufferA;
     bool needBufferB;
     bool inv;
@@ -2763,7 +2763,7 @@ public:
                     count = height;
                 }
                 needBufferA = isInplace;
-                hal::dftInit1D(contextA, len, count, depth, f, &needBufferA);
+                contextA = hal::DFT1D::create(len, count, depth, f, &needBufferA);
                 if (needBufferA)
                     tmp_bufA.allocate(len * complex_elem_size);
             }
@@ -2773,7 +2773,7 @@ public:
                 count = width;
                 f |= CV_HAL_DFT_STAGE_COLS;
                 needBufferB = isInplace;
-                hal::dftInit1D(contextB, len, count, depth, f, &needBufferB);
+                contextB = hal::DFT1D::create(len, count, depth, f, &needBufferB);
                 if (needBufferB)
                     tmp_bufB.allocate(len * complex_elem_size);
 
@@ -2783,7 +2783,7 @@ public:
         }
     }
 
-    void run(uchar * src, int src_step, uchar * dst, int dst_step)
+    void apply(const uchar * src, size_t src_step, uchar * dst, size_t dst_step)
     {
 #if defined USE_IPP_DFT
         if (useIpp)
@@ -2860,17 +2860,9 @@ public:
         }
     }
 
-    void free()
-    {
-        if (useIpp)
-            return;
-        hal::dftFree1D(contextA);
-        hal::dftFree1D(contextB);
-    }
-
 protected:
 
-    void rowDft(uchar* src_data, int src_step, uchar* dst_data, int dst_step, bool isComplex, bool isLastStage)
+    void rowDft(const uchar* src_data, int src_step, uchar* dst_data, int dst_step, bool isComplex, bool isLastStage)
     {
         int len, count;
         if (width == 1 && !isRowTransform )
@@ -2909,7 +2901,7 @@ protected:
             if( needBufferA )
                 dptr = tmp_bufA;
 
-            hal::dft1D(contextA, sptr, dptr);
+            contextA->apply(sptr, dptr);
 
             if( needBufferA )
                 memcpy( dptr0, dptr + dptr_offset, dst_full_len );
@@ -2924,7 +2916,7 @@ protected:
             complementComplexOutput(depth, dst_data, dst_step, len, nz, 1);
     }
 
-    void colDft(uchar* src_data, int src_step, uchar* dst_data, int dst_step, int stage_src_channels, int stage_dst_channels, bool isLastStage)
+    void colDft(const uchar* src_data, int src_step, uchar* dst_data, int dst_step, int stage_src_channels, int stage_dst_channels, bool isLastStage)
     {
         int len = height;
         int count = width;
@@ -2983,8 +2975,8 @@ protected:
             }
 
             if( even )
-                hal::dft1D(contextB, buf1, dbuf1);
-            hal::dft1D(contextB, buf0, dbuf0);
+                contextB->apply(buf1, dbuf1);
+            contextB->apply(buf0, dbuf0);
 
             if( stage_dst_channels == 1 )
             {
@@ -3032,12 +3024,12 @@ protected:
             if( i+1 < b )
             {
                 CopyFrom2Columns( sptr0, src_step, buf0, buf1, len, complex_elem_size );
-                hal::dft1D(contextB, buf1, dbuf1);
+                contextB->apply(buf1, dbuf1);
             }
             else
                 CopyColumn( sptr0, src_step, buf0, complex_elem_size, len, complex_elem_size );
 
-            hal::dft1D(contextB, buf0, dbuf0);
+            contextB->apply(buf0, dbuf0);
 
             if( i+1 < b )
                 CopyTo2Columns( dbuf0, dbuf1, dptr0, dst_step, len, complex_elem_size );
@@ -3051,7 +3043,7 @@ protected:
     }
 };
 
-class OcvDftBasicImpl
+class OcvDftBasicImpl : public hal::DFT1D
 {
 public:
     OcvDftOptions opt;
@@ -3068,11 +3060,6 @@ public:
     {
         opt.factors = _factors;
     }
-    OcvDftBasicImpl & operator=(const OcvDftBasicImpl & other)
-    {
-        this->opt = other.opt;
-        return *this;
-    }
     void init(int len, int count, int depth, int flags, bool *needBuffer)
     {
         int prev_len = opt.n;
@@ -3211,7 +3198,7 @@ public:
         }
     }
 
-    void run(const void * src, void * dst)
+    void apply(const uchar *src, uchar *dst)
     {
         opt.dft_func(opt, src, dst);
     }
@@ -3219,126 +3206,113 @@ public:
     void free() {}
 };
 
+struct ReplacementDFT1D : public hal::DFT1D // hal::DFT1D implementation that forwards to the cv_hal_*1D replacement hooks
+{
+    cvhalDFT *context;   // handle written by cv_hal_dftInit1D() in init(); null until then
+    bool isInitialized;  // true only when cv_hal_dftInit1D() returned CV_HAL_ERROR_OK
+
+    ReplacementDFT1D() : context(0), isInitialized(false) {}
+    bool init(int len, int count, int depth, int flags, bool *needBuffer) // returns false when the hook did not report CV_HAL_ERROR_OK
+    {
+        int res = cv_hal_dftInit1D(&context, len, count, depth, flags, needBuffer);
+        isInitialized = (res == CV_HAL_ERROR_OK);
+        return isInitialized;
+    }
+    void apply(const uchar *src, uchar *dst) // runs the transform through cv_hal_dft1D; does nothing if init() failed
+    {
+        if (isInitialized)
+        {
+            CALL_HAL(dft1D, cv_hal_dft1D, context, src, dst); // NOTE(review): CALL_HAL is defined outside this hunk - confirm its failure handling
+        }
+    }
+    ~ReplacementDFT1D() // releases the context only if init() succeeded
+    {
+        if (isInitialized)
+        {
+            CALL_HAL(dftFree1D, cv_hal_dftFree1D, context);
+        }
+    }
+};
+
+struct ReplacementDFT2D : public hal::DFT2D // hal::DFT2D implementation that forwards to the cv_hal_*2D replacement hooks
+{
+    cvhalDFT *context;   // handle written by cv_hal_dftInit2D() in init(); null until then
+    bool isInitialized;  // true only when cv_hal_dftInit2D() returned CV_HAL_ERROR_OK
+
+    ReplacementDFT2D() : context(0), isInitialized(false) {}
+    bool init(int width, int height, int depth,
+              int src_channels, int dst_channels,
+              int flags, int nonzero_rows) // returns false when the hook did not report CV_HAL_ERROR_OK
+    {
+        int res = cv_hal_dftInit2D(&context, width, height, depth, src_channels, dst_channels, flags, nonzero_rows);
+        isInitialized = (res == CV_HAL_ERROR_OK);
+        return isInitialized;
+    }
+    void apply(const uchar *src, size_t src_step, uchar *dst, size_t dst_step) // runs the transform through cv_hal_dft2D; does nothing if init() failed
+    {
+        if (isInitialized)
+        {
+            CALL_HAL(dft2D, cv_hal_dft2D, context, src, src_step, dst, dst_step); // NOTE(review): CALL_HAL is defined outside this hunk - confirm its failure handling
+        }
+    }
+    ~ReplacementDFT2D() // releases the context only if init() succeeded
+    {
+        if (isInitialized)
+        {
+            CALL_HAL(dftFree2D, cv_hal_dftFree2D, context); // must pair with cv_hal_dftInit2D: use the 2D free hook, not the 1D one
+        }
+    }
+};
+
 namespace hal {
 
 //================== 1D ======================
 
-void dftInit1D(DftContext & context, int len, int count, int depth, int flags, bool *needBuffer)
+Ptr<DFT1D> DFT1D::create(int len, int count, int depth, int flags, bool *needBuffer)
 {
-    int res = cv_hal_dftInit1D(&context.impl, len, count, depth, flags, needBuffer);
-    if (res == CV_HAL_ERROR_OK)
     {
-        context.useReplacement = true;
-        return;
-    }
-
-    context.useReplacement = false;
-    OcvDftBasicImpl * c = (OcvDftBasicImpl*)context.impl;
-    if (!c)
-    {
-        c = new OcvDftBasicImpl();
-        context.impl = (void*)c;
-    }
-    c->init(len, count, depth, flags, needBuffer);
-}
-
-void dft1D(const DftContext & context, const void * src, void * dst)
-{
-    if (context.useReplacement)
-    {
-        int res = cv_hal_dft1D(context.impl, src, dst);
-        if (res != CV_HAL_ERROR_OK)
+        ReplacementDFT1D *impl = new ReplacementDFT1D();
+        if (impl->init(len, count, depth, flags, needBuffer))
         {
-            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftRun");
+            return Ptr<DFT1D>(impl);
         }
-        return;
+        delete impl;
     }
-    OcvDftBasicImpl * c = (OcvDftBasicImpl*)context.impl;
-    c->run(src, dst);
-}
-
-void dftFree1D(DftContext & context)
-{
-    if (context.useReplacement)
     {
-        int res = cv_hal_dftFree1D(context.impl);
-        if (res != CV_HAL_ERROR_OK)
-        {
-            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftFree");
-        }
-        return;
-    }
-
-    OcvDftBasicImpl * c = (OcvDftBasicImpl*)context.impl;
-    if (c)
-    {
-        c->free();
-        delete c;
-        context.impl = 0;
+        OcvDftBasicImpl *impl = new OcvDftBasicImpl();
+        impl->init(len, count, depth, flags, needBuffer);
+        return Ptr<DFT1D>(impl);
     }
 }
 
-
 //================== 2D ======================
 
-void dftInit2D(DftContext & c,
-             int _width, int _height, int _depth, int _src_channels, int _dst_channels,
-             int flags,
-             int _nonzero_rows)
+Ptr<DFT2D> DFT2D::create(int width, int height, int depth,
+                         int src_channels, int dst_channels,
+                         int flags, int nonzero_rows)
 {
-    int res = cv_hal_dftInit2D(&c.impl, _width, _height, _depth, _src_channels, _dst_channels, flags, _nonzero_rows);
-    if (res == CV_HAL_ERROR_OK)
     {
-        c.useReplacement = true;
-        return;
-    }
-    c.useReplacement = false;
-
-    if( _width == 1 && _nonzero_rows > 0 )
-        CV_Error( CV_StsNotImplemented,
-        "This mode (using nonzero_rows with a single-column matrix) breaks the function's logic, so it is prohibited.\n"
-        "For fast convolution/correlation use 2-column matrix or single-row matrix instead" );
-
-    OcvDftImpl * d = new OcvDftImpl();
-    d->init(_width, _height, _depth, _src_channels, _dst_channels, flags, _nonzero_rows);
-    c.impl = (void*)d;
-}
-
-void dft2D(const DftContext & c,
-         const void * src, int src_step, void * dst, int dst_step)
-{
-    if (c.useReplacement)
-    {
-        int res = cv_hal_dft2D(c.impl, (uchar*)src, src_step, (uchar*)dst, dst_step);
-        if (res != CV_HAL_ERROR_OK)
+        ReplacementDFT2D *impl = new ReplacementDFT2D();
+        if (impl->init(width, height, depth, src_channels, dst_channels, flags, nonzero_rows))
         {
-            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftRun2D");
+            return Ptr<DFT2D>(impl);
         }
-        return;
+        delete impl;
     }
-    OcvDftImpl * d = (OcvDftImpl*)c.impl;
-    d->run((uchar*)src, src_step, (uchar*)dst, dst_step);
-}
-
-void dftFree2D(DftContext & c)
-{
-    if (c.useReplacement)
     {
-        int res = cv_hal_dftFree2D(c.impl);
-        if (res != CV_HAL_ERROR_OK)
+        if(width == 1 && nonzero_rows > 0 )
         {
-            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dftFree2D");
+            CV_Error( CV_StsNotImplemented,
+            "This mode (using nonzero_rows with a single-column matrix) breaks the function's logic, so it is prohibited.\n"
+            "For fast convolution/correlation use 2-column matrix or single-row matrix instead" );
         }
-        return;
+        OcvDftImpl *impl = new OcvDftImpl();
+        impl->init(width, height, depth, src_channels, dst_channels, flags, nonzero_rows);
+        return Ptr<DFT2D>(impl);
     }
-    OcvDftImpl * d = (OcvDftImpl*)c.impl;
-    d->free();
-    delete d;
-    c.impl = 0;
 }
 
 } // cv::hal::
-
 } // cv::
 
 
@@ -3382,10 +3356,8 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
         f |= CV_HAL_DFT_SCALE;
     if (src.data == dst.data)
         f |= CV_HAL_DFT_IS_INPLACE;
-    hal::DftContext c;
-    hal::dftInit2D(c, src.cols, src.rows, depth, src.channels(), dst.channels(), f, nonzero_rows);
-    hal::dft2D(c, src.data, (int)src.step, dst.data, (int)dst.step);
-    hal::dftFree2D(c);
+    Ptr<hal::DFT2D> c = hal::DFT2D::create(src.cols, src.rows, depth, src.channels(), dst.channels(), f, nonzero_rows);
+    c->apply(src.data, src.step, dst.data, dst.step);
 }
 
 
@@ -3607,7 +3579,7 @@ namespace cv
    http://www.ece.utexas.edu/~bevans/courses/ee381k/lectures/09_DCT/lecture9/:
 */
 template<typename T> static void
-DCT( const OcvDftOptions & c, const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
+DCT( const OcvDftOptions & c, const T* src, size_t src_step, T* dft_src, T* dft_dst, T* dst, size_t dst_step,
      const Complex<T>* dct_wave )
 {
     static const T sin_45 = (T)0.70710678118654752440084436210485;
@@ -3650,7 +3622,7 @@ DCT( const OcvDftOptions & c, const T* src, int src_step, T* dft_src, T* dft_dst
 
 
 template<typename T> static void
-IDCT( const OcvDftOptions & c, const T* src, int src_step, T* dft_src, T* dft_dst, T* dst, int dst_step,
+IDCT( const OcvDftOptions & c, const T* src, size_t src_step, T* dft_src, T* dft_dst, T* dst, size_t dst_step,
       const Complex<T>* dct_wave)
 {
     static const T sin_45 = (T)0.70710678118654752440084436210485;
@@ -3768,29 +3740,29 @@ DCTInit( int n, int elem_size, void* _wave, int inv )
 }
 
 
-typedef void (*DCTFunc)(const OcvDftOptions & c, const void* src, int src_step, void* dft_src,
-                        void* dft_dst, void* dst, int dst_step, const void* dct_wave);
+typedef void (*DCTFunc)(const OcvDftOptions & c, const void* src, size_t src_step, void* dft_src,
+                        void* dft_dst, void* dst, size_t dst_step, const void* dct_wave);
 
-static void DCT_32f(const OcvDftOptions & c, const float* src, int src_step, float* dft_src, float* dft_dst,
-                    float* dst, int dst_step, const Complexf* dct_wave)
+static void DCT_32f(const OcvDftOptions & c, const float* src, size_t src_step, float* dft_src, float* dft_dst,
+                    float* dst, size_t dst_step, const Complexf* dct_wave)
 {
     DCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave);
 }
 
-static void IDCT_32f(const OcvDftOptions & c, const float* src, int src_step, float* dft_src, float* dft_dst,
-                    float* dst, int dst_step, const Complexf* dct_wave)
+static void IDCT_32f(const OcvDftOptions & c, const float* src, size_t src_step, float* dft_src, float* dft_dst,
+                    float* dst, size_t dst_step, const Complexf* dct_wave)
 {
     IDCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave);
 }
 
-static void DCT_64f(const OcvDftOptions & c, const double* src, int src_step, double* dft_src, double* dft_dst,
-                    double* dst, int dst_step, const Complexd* dct_wave)
+static void DCT_64f(const OcvDftOptions & c, const double* src, size_t src_step, double* dft_src, double* dft_dst,
+                    double* dst, size_t dst_step, const Complexd* dct_wave)
 {
     DCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave);
 }
 
-static void IDCT_64f(const OcvDftOptions & c, const double* src, int src_step, double* dft_src, double* dft_dst,
-                     double* dst, int dst_step, const Complexd* dct_wave)
+static void IDCT_64f(const OcvDftOptions & c, const double* src, size_t src_step, double* dft_src, double* dft_dst,
+                     double* dst, size_t dst_step, const Complexd* dct_wave)
 {
     IDCT(c, src, src_step, dft_src, dft_dst, dst, dst_step, dct_wave);
 }
@@ -4058,7 +4030,7 @@ static bool ippi_DCT_32f(const uchar * src, int src_step, uchar * dst, int dst_s
 
 namespace cv {
 
-class OcvDctImpl
+class OcvDctImpl : public hal::DCT2D
 {
 public:
     OcvDftOptions opt;
@@ -4110,7 +4082,7 @@ public:
             end_stage = 1;
         }
     }
-    void run(uchar * src, int src_step, uchar * dst, int dst_step)
+    void apply(const uchar *src, size_t src_step, uchar *dst, size_t dst_step)
     {
         CV_IPP_RUN(IPP_VERSION_X100 >= 700 && depth == CV_32F, ippi_DCT_32f(src, src_step, dst, dst_step, width, height, isInverse, isRowTransform))
 
@@ -4183,69 +4155,65 @@ public:
                 prev_len = len;
             }
             // otherwise reuse the tables calculated on the previous stage
-            for(int i = 0; i < count; i++ )
+            for(unsigned i = 0; i < static_cast<unsigned>(count); i++ )
             {
-                dct_func( opt, sptr + i*sstep0, (int)sstep1, src_dft_buf, dst_dft_buf,
-                          dptr + i*dstep0, (int)dstep1, dct_wave);
+                dct_func( opt, sptr + i*sstep0, sstep1, src_dft_buf, dst_dft_buf,
+                          dptr + i*dstep0, dstep1, dct_wave);
             }
             src = dst;
             src_step = dst_step;
         }
-
     }
-    void free() {}
+};
+
+struct ReplacementDCT2D : public hal::DCT2D // DCT2D implementation that forwards to the cv_hal_dct* replacement hooks
+{
+    cvhalDFT *context;   // opaque context filled in by cv_hal_dctInit2D
+    bool isInitialized;  // true only when cv_hal_dctInit2D returned CV_HAL_ERROR_OK
+
+    ReplacementDCT2D() : context(0), isInitialized(false) {}
+    bool init(int width, int height, int depth, int flags) // returns false when the HAL init does not report success
+    {
+        int res = cv_hal_dctInit2D(&context, width, height, depth, flags); // use the cv_hal_ alias (not the hal_ni_ stub) so a custom HAL override is honoured
+        isInitialized = (res == CV_HAL_ERROR_OK);
+        return isInitialized;
+    }
+    void apply(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step)
+    {
+        if (isInitialized) // does nothing unless init() succeeded
+        {
+            CALL_HAL(dct2D, cv_hal_dct2D, context, src_data, src_step, dst_data, dst_step);
+        }
+    }
+    ~ReplacementDCT2D()
+    {
+        if (isInitialized) // only release a context that the HAL actually created
+        {
+            CALL_HAL(dctFree2D, cv_hal_dctFree2D, context);
+        }
+    }
 };
 
 namespace hal {
 
-void dctInit2D(DftContext & c, int width, int height, int depth, int flags)
+Ptr<DCT2D> DCT2D::create(int width, int height, int depth, int flags) // factory: HAL-backed DCT if its init succeeds, otherwise OcvDctImpl
 {
-    int res = cv_hal_dctInit2D(&c.impl, width, height, depth, flags);
-    if (res == CV_HAL_ERROR_OK)
     {
-        c.useReplacement = true;
-        return;
-    }
-    c.useReplacement = false;
-    OcvDctImpl * impl = new OcvDctImpl();
-    impl->init(width, height, depth, flags);
-    c.impl = impl;
-}
-
-void dct2D(const DftContext & c, const void * src, int src_step, void * dst, int dst_step)
-{
-    if (c.useReplacement)
-    {
-        int res = cv_hal_dct2D(c.impl, src, src_step, dst, dst_step);
-        if (res != CV_HAL_ERROR_OK)
+        ReplacementDCT2D *impl = new ReplacementDCT2D(); // first attempt: the custom HAL replacement
+        if (impl->init(width, height, depth, flags)) // true only when the HAL init reported success
         {
-            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dctRun");
+            return Ptr<DCT2D>(impl); // the object is handed over to the returned Ptr
         }
-        return;
+        delete impl; // HAL init did not succeed: discard it and fall through to the built-in path
     }
-    OcvDctImpl * impl = (OcvDctImpl*)c.impl;
-    impl->run((uchar*)src, src_step, (uchar*)dst, dst_step);
-}
-
-void dctFree2D(DftContext & c)
-{
-    if (c.useReplacement)
     {
-        int res = cv_hal_dctFree2D(c.impl);
-        if (res != CV_HAL_ERROR_OK)
-        {
-            CV_Error( CV_StsNotImplemented, "Custom HAL implementation failed to call dctFree");
-        }
-        return;
+        OcvDctImpl *impl = new OcvDctImpl(); // fallback: OpenCV's own DCT implementation
+        impl->init(width, height, depth, flags);
+        return Ptr<DCT2D>(impl);
     }
-    OcvDctImpl * impl = (OcvDctImpl*)c.impl;
-    impl->free();
-    delete impl;
-    c.impl = 0;
 }
 
 } // cv::hal::
-
 } // cv::
 
 void cv::dct( InputArray _src0, OutputArray _dst, int flags )
@@ -4265,10 +4233,8 @@ void cv::dct( InputArray _src0, OutputArray _dst, int flags )
     if (src.isContinuous() && dst.isContinuous())
         f |= CV_HAL_DFT_IS_CONTINUOUS;
 
-    hal::DftContext c;
-    hal::dctInit2D(c, src.cols, src.rows, depth, f);
-    hal::dct2D(c, (void*)src.data, (int)src.step, (void*)dst.data, (int)dst.step);
-    hal::dctFree2D(c);
+    Ptr<hal::DCT2D> c = hal::DCT2D::create(src.cols, src.rows, depth, f);
+    c->apply(src.data, src.step, dst.data, dst.step);
 }
 
 
diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp
index bbf32f39d..93476c459 100644
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@@ -376,6 +376,102 @@ inline int hal_ni_merge64s(const int64 **src_data, int64 *dst_data, int len, int
 #define cv_hal_merge64s hal_ni_merge64s
 //! @endcond
 
+/**
+@brief Dummy structure storing DFT/DCT context
+
+Users can convert this pointer to any type they want. Initialisation and destruction should be performed in the Init and Free function implementations, respectively.
+Example:
+@code{.cpp}
+int my_hal_dftInit2D(cvhalDFT **context, ...) {
+    *context = reinterpret_cast<cvhalDFT*>(new MyFilterData());
+    //... init
+}
+
+int my_hal_dftFree2D(cvhalDFT *context) {
+    MyFilterData *c = reinterpret_cast<MyFilterData*>(context);
+    delete c;
+}
+@endcode
+ */
+struct cvhalDFT {};
+
+/**
+@param context double pointer to context storing all necessary data
+@param len transformed array length
+@param count estimated transformation count
+@param depth array type (CV_32F or CV_64F)
+@param flags algorithm options (combination of CV_HAL_DFT_INVERSE, CV_HAL_DFT_SCALE, ...)
+@param needBuffer pointer to boolean variable, if valid pointer provided, then variable value should be set to true to signal that additional memory buffer is needed for operations
+ */
+inline int hal_ni_dftInit1D(cvhalDFT **context, int len, int count, int depth, int flags, bool *needBuffer) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param context pointer to context storing all necessary data
+@param src source data
+@param dst destination data
+ */
+inline int hal_ni_dft1D(cvhalDFT *context, const uchar *src, uchar *dst) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param context pointer to context storing all necessary data
+ */
+inline int hal_ni_dftFree1D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_dftInit1D hal_ni_dftInit1D
+#define cv_hal_dft1D hal_ni_dft1D
+#define cv_hal_dftFree1D hal_ni_dftFree1D
+//! @endcond
+
+/**
+@param context double pointer to context storing all necessary data
+@param width,height image dimensions
+@param depth image type (CV_32F or CV_64F)
+@param src_channels number of channels in input image
+@param dst_channels number of channels in output image
+@param flags algorithm options (combination of CV_HAL_DFT_INVERSE, ...)
+@param nonzero_rows number of nonzero rows in image, can be used for optimization
+ */
+inline int hal_ni_dftInit2D(cvhalDFT **context, int width, int height, int depth, int src_channels, int dst_channels, int flags, int nonzero_rows) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param context pointer to context storing all necessary data
+@param src_data,src_step source image data and step
+@param dst_data,dst_step destination image data and step
+ */
+inline int hal_ni_dft2D(cvhalDFT *context, const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param context pointer to context storing all necessary data
+ */
+inline int hal_ni_dftFree2D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_dftInit2D hal_ni_dftInit2D
+#define cv_hal_dft2D hal_ni_dft2D
+#define cv_hal_dftFree2D hal_ni_dftFree2D
+//! @endcond
+
+/**
+@param context double pointer to context storing all necessary data
+@param width,height image dimensions
+@param depth image type (CV_32F or CV_64F)
+@param flags algorithm options (combination of CV_HAL_DFT_INVERSE, ...)
+ */
+inline int hal_ni_dctInit2D(cvhalDFT **context, int width, int height, int depth, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param context pointer to context storing all necessary data
+@param src_data,src_step source image data and step
+@param dst_data,dst_step destination image data and step
+ */
+inline int hal_ni_dct2D(cvhalDFT *context, const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+@param context pointer to context storing all necessary data
+ */
+inline int hal_ni_dctFree2D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+
+//! @cond IGNORED
+#define cv_hal_dctInit2D hal_ni_dctInit2D
+#define cv_hal_dct2D hal_ni_dct2D
+#define cv_hal_dctFree2D hal_ni_dctFree2D
+//! @endcond
+
 //! @}
 
 #if defined __GNUC__
@@ -384,31 +480,6 @@ inline int hal_ni_merge64s(const int64 **src_data, int64 *dst_data, int len, int
 #  pragma warning( pop )
 #endif
 
-inline int hal_ni_dftInit1D(void**, int, int, int, int, bool*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dft1D(const void*, const void*, void*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dftFree1D(void*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-
-#define cv_hal_dftInit1D hal_ni_dftInit1D
-#define cv_hal_dft1D hal_ni_dft1D
-#define cv_hal_dftFree1D hal_ni_dftFree1D
-
-inline int hal_ni_dftInit2D(void **, int, int, int, int, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dft2D(const void *, const void *, int, void *, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dftFree2D(void *) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-
-#define cv_hal_dftInit2D hal_ni_dftInit2D
-#define cv_hal_dft2D hal_ni_dft2D
-#define cv_hal_dftFree2D hal_ni_dftFree2D
-
-
-inline int hal_ni_dctInit2D(void **, int, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dct2D(const void *, const void *, int, void *, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_dctFree2D(void *) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-
-#define cv_hal_dctInit2D hal_ni_dctInit2D
-#define cv_hal_dct2D hal_ni_dct2D
-#define cv_hal_dctFree2D hal_ni_dctFree2D
-
 #include "custom_hal.hpp"
 
 #endif
diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp
index 4e8958279..019c41f33 100644
--- a/modules/imgproc/src/templmatch.cpp
+++ b/modules/imgproc/src/templmatch.cpp
@@ -700,8 +700,7 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
 
     buf.resize(bufSize);
 
-    hal::DftContext c;
-    hal::dftInit2D(c, dftsize.width, dftsize.height, dftTempl.depth(), 1, 1, CV_HAL_DFT_IS_INPLACE, templ.rows);
+    Ptr<hal::DFT2D> c = hal::DFT2D::create(dftsize.width, dftsize.height, dftTempl.depth(), 1, 1, CV_HAL_DFT_IS_INPLACE, templ.rows);
 
     // compute DFT of each template plane
     for( k = 0; k < tcn; k++ )
@@ -726,11 +725,9 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
             Mat part(dst, Range(0, templ.rows), Range(templ.cols, dst.cols));
             part = Scalar::all(0);
         }
-        hal::dft2D(c, dst.data, (int)dst.step, dst.data, (int)dst.step);
+        c->apply(dst.data, (int)dst.step, dst.data, (int)dst.step);
     }
 
-    hal::dftFree2D(c);
-
     int tileCountX = (corr.cols + blocksize.width - 1)/blocksize.width;
     int tileCountY = (corr.rows + blocksize.height - 1)/blocksize.height;
     int tileCount = tileCountX * tileCountY;
@@ -747,11 +744,11 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
     }
     borderType |= BORDER_ISOLATED;
 
-    hal::DftContext cF, cR;
+    Ptr<hal::DFT2D> cF, cR;
     int f = CV_HAL_DFT_IS_INPLACE;
     int f_inv = f | CV_HAL_DFT_INVERSE | CV_HAL_DFT_SCALE;
-    hal::dftInit2D(cF, dftsize.width, dftsize.height, maxDepth, 1, 1, f, blocksize.height + templ.rows - 1);
-    hal::dftInit2D(cR, dftsize.width, dftsize.height, maxDepth, 1, 1, f_inv, blocksize.height);
+    cF = hal::DFT2D::create(dftsize.width, dftsize.height, maxDepth, 1, 1, f, blocksize.height + templ.rows - 1);
+    cR = hal::DFT2D::create(dftsize.width, dftsize.height, maxDepth, 1, 1, f_inv, blocksize.height);
 
     // calculate correlation by blocks
     for( i = 0; i < tileCount; i++ )
@@ -791,7 +788,7 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
                                x1-x0, dst.cols-dst1.cols-(x1-x0), borderType);
 
             if (bsz.height == blocksize.height)
-                hal::dft2D(cF, dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
+                cF->apply(dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
             else
                 dft( dftImg, dftImg, 0, dsz.height );
 
@@ -800,7 +797,7 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
             mulSpectrums(dftImg, dftTempl1, dftImg, 0, true);
 
             if (bsz.height == blocksize.height)
-                hal::dft2D(cR, dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
+                cR->apply(dftImg.data, (int)dftImg.step, dftImg.data, (int)dftImg.step);
             else
                 dft( dftImg, dftImg, DFT_INVERSE + DFT_SCALE, bsz.height );
 
@@ -834,8 +831,6 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
             }
         }
     }
-    hal::dftFree2D(cF);
-    hal::dftFree2D(cR);
 }
 
 static void matchTemplateMask( InputArray _img, InputArray _templ, OutputArray _result, int method, InputArray _mask )

From 11378fcb178b6d86b90f9d3a096da34f2176e996 Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@itseez.com>
Date: Tue, 19 Apr 2016 14:50:07 +0300
Subject: [PATCH 6/7] Fixed compilation problems

---
 modules/core/include/opencv2/core/hal/hal.hpp |  1 +
 modules/core/src/dxt.cpp                      | 44 +++++++++----------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/hal.hpp b/modules/core/include/opencv2/core/hal/hal.hpp
index 5b01cbe4c..09bcd72d5 100644
--- a/modules/core/include/opencv2/core/hal/hal.hpp
+++ b/modules/core/include/opencv2/core/hal/hal.hpp
@@ -46,6 +46,7 @@
 #define __OPENCV_HAL_HPP__
 
 #include "opencv2/core/cvdef.h"
+#include "opencv2/core/cvstd.hpp"
 #include "opencv2/core/hal/interface.h"
 
 //! @cond IGNORED
diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp
index 2cff51d5a..164b0f10a 100644
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -1553,7 +1553,7 @@ class Dft_C_IPPLoop_Invoker : public ParallelLoopBody
 {
 public:
 
-    Dft_C_IPPLoop_Invoker(const uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width,
+    Dft_C_IPPLoop_Invoker(const uchar * _src, size_t _src_step, uchar * _dst, size_t _dst_step, int _width,
                           const Dft& _ippidft, int _norm_flag, bool *_ok) :
         ParallelLoopBody(),
         src(_src), src_step(_src_step), dst(_dst), dst_step(_dst_step), width(_width),
@@ -1618,9 +1618,9 @@ public:
 
 private:
     const uchar * src;
-    int src_step;
+    size_t src_step;
     uchar * dst;
-    int dst_step;
+    size_t dst_step;
     int width;
     const Dft& ippidft;
     int norm_flag;
@@ -1634,7 +1634,7 @@ class Dft_R_IPPLoop_Invoker : public ParallelLoopBody
 {
 public:
 
-    Dft_R_IPPLoop_Invoker(const uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width,
+    Dft_R_IPPLoop_Invoker(const uchar * _src, size_t _src_step, uchar * _dst, size_t _dst_step, int _width,
                           const Dft& _ippidft, int _norm_flag, bool *_ok) :
         ParallelLoopBody(),
         src(_src), src_step(_src_step), dst(_dst), dst_step(_dst_step), width(_width),
@@ -1699,9 +1699,9 @@ public:
 
 private:
     const uchar * src;
-    int src_step;
+    size_t src_step;
     uchar * dst;
-    int dst_step;
+    size_t dst_step;
     int width;
     const Dft& ippidft;
     int norm_flag;
@@ -1711,7 +1711,7 @@ private:
 };
 
 template <typename Dft>
-bool Dft_C_IPPLoop(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, const Dft& ippidft, int norm_flag)
+bool Dft_C_IPPLoop(const uchar * src, size_t src_step, uchar * dst, size_t dst_step, int width, int height, const Dft& ippidft, int norm_flag)
 {
     bool ok;
     parallel_for_(Range(0, height), Dft_C_IPPLoop_Invoker<Dft>(src, src_step, dst, dst_step, width, ippidft, norm_flag, &ok), (width * height)/(double)(1<<16) );
@@ -1719,7 +1719,7 @@ bool Dft_C_IPPLoop(const uchar * src, int src_step, uchar * dst, int dst_step, i
 }
 
 template <typename Dft>
-bool Dft_R_IPPLoop(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, const Dft& ippidft, int norm_flag)
+bool Dft_R_IPPLoop(const uchar * src, size_t src_step, uchar * dst, size_t dst_step, int width, int height, const Dft& ippidft, int norm_flag)
 {
     bool ok;
     parallel_for_(Range(0, height), Dft_R_IPPLoop_Invoker<Dft>(src, src_step, dst, dst_step, width, ippidft, norm_flag, &ok), (width * height)/(double)(1<<16) );
@@ -1730,9 +1730,9 @@ struct IPPDFT_C_Functor
 {
     IPPDFT_C_Functor(ippiDFT_C_Func _func) : func(_func){}
 
-    bool operator()(const Ipp32fc* src, int srcStep, Ipp32fc* dst, int dstStep, const IppiDFTSpec_C_32fc* pDFTSpec, Ipp8u* pBuffer) const
+    bool operator()(const Ipp32fc* src, size_t srcStep, Ipp32fc* dst, size_t dstStep, const IppiDFTSpec_C_32fc* pDFTSpec, Ipp8u* pBuffer) const
     {
-        return func ? func(src, srcStep, dst, dstStep, pDFTSpec, pBuffer) >= 0 : false;
+        return func ? func(src, static_cast<int>(srcStep), dst, static_cast<int>(dstStep), pDFTSpec, pBuffer) >= 0 : false;
     }
 private:
     ippiDFT_C_Func func;
@@ -1742,15 +1742,15 @@ struct IPPDFT_R_Functor
 {
     IPPDFT_R_Functor(ippiDFT_R_Func _func) : func(_func){}
 
-    bool operator()(const Ipp32f* src, int srcStep, Ipp32f* dst, int dstStep, const IppiDFTSpec_R_32f* pDFTSpec, Ipp8u* pBuffer) const
+    bool operator()(const Ipp32f* src, size_t srcStep, Ipp32f* dst, size_t dstStep, const IppiDFTSpec_R_32f* pDFTSpec, Ipp8u* pBuffer) const
     {
-        return func ? func(src, srcStep, dst, dstStep, pDFTSpec, pBuffer) >= 0 : false;
+        return func ? func(src, static_cast<int>(srcStep), dst, static_cast<int>(dstStep), pDFTSpec, pBuffer) >= 0 : false;
     }
 private:
     ippiDFT_R_Func func;
 };
 
-static bool ippi_DFT_C_32F(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, int norm_flag)
+static bool ippi_DFT_C_32F(const uchar * src, size_t src_step, uchar * dst, size_t dst_step, int width, int height, bool inv, int norm_flag)
 {
     IppStatus status;
     Ipp8u* pBuffer = 0;
@@ -1787,9 +1787,9 @@ static bool ippi_DFT_C_32F(const uchar * src, int src_step, uchar * dst, int dst
     }
 
     if (!inv)
-        status = ippiDFTFwd_CToC_32fc_C1R( (Ipp32fc*)src, src_step, (Ipp32fc*)dst, dst_step, pDFTSpec, pBuffer );
+        status = ippiDFTFwd_CToC_32fc_C1R( (Ipp32fc*)src, static_cast<int>(src_step), (Ipp32fc*)dst, static_cast<int>(dst_step), pDFTSpec, pBuffer );
     else
-        status = ippiDFTInv_CToC_32fc_C1R( (Ipp32fc*)src, src_step, (Ipp32fc*)dst, dst_step, pDFTSpec, pBuffer );
+        status = ippiDFTInv_CToC_32fc_C1R( (Ipp32fc*)src, static_cast<int>(src_step), (Ipp32fc*)dst, static_cast<int>(dst_step), pDFTSpec, pBuffer );
 
     if ( sizeBuffer > 0 )
         ippFree( pBuffer );
@@ -1804,7 +1804,7 @@ static bool ippi_DFT_C_32F(const uchar * src, int src_step, uchar * dst, int dst
     return false;
 }
 
-static bool ippi_DFT_R_32F(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, int norm_flag)
+static bool ippi_DFT_R_32F(const uchar * src, size_t src_step, uchar * dst, size_t dst_step, int width, int height, bool inv, int norm_flag)
 {
     IppStatus status;
     Ipp8u* pBuffer = 0;
@@ -1841,9 +1841,9 @@ static bool ippi_DFT_R_32F(const uchar * src, int src_step, uchar * dst, int dst
     }
 
     if (!inv)
-        status = ippiDFTFwd_RToPack_32f_C1R( (float*)src, src_step, (float*)dst, dst_step, pDFTSpec, pBuffer );
+        status = ippiDFTFwd_RToPack_32f_C1R( (float*)src, static_cast<int>(src_step), (float*)dst, static_cast<int>(dst_step), pDFTSpec, pBuffer );
     else
-        status = ippiDFTInv_PackToR_32f_C1R( (float*)src, src_step, (float*)dst, dst_step, pDFTSpec, pBuffer );
+        status = ippiDFTInv_PackToR_32f_C1R( (float*)src, static_cast<int>(src_step), (float*)dst, static_cast<int>(dst_step), pDFTSpec, pBuffer );
 
     if ( sizeBuffer > 0 )
         ippFree( pBuffer );
@@ -2487,7 +2487,7 @@ namespace cv
 {
 
 template <typename T>
-static void complementComplex(T * ptr, int step, int n, int len, int dft_dims)
+static void complementComplex(T * ptr, size_t step, int n, int len, int dft_dims)
 {
     T* p0 = (T*)ptr;
     size_t dstep = step/sizeof(p0[0]);
@@ -2504,7 +2504,7 @@ static void complementComplex(T * ptr, int step, int n, int len, int dft_dims)
     }
 }
 
-static void complementComplexOutput(int depth, uchar * ptr, int step, int count, int len, int dft_dims)
+static void complementComplexOutput(int depth, uchar * ptr, size_t step, int count, int len, int dft_dims)
 {
     if( depth == CV_32F )
         complementComplex((float*)ptr, step, count, len, dft_dims);
@@ -2862,7 +2862,7 @@ public:
 
 protected:
 
-    void rowDft(const uchar* src_data, int src_step, uchar* dst_data, int dst_step, bool isComplex, bool isLastStage)
+    void rowDft(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, bool isComplex, bool isLastStage)
     {
         int len, count;
         if (width == 1 && !isRowTransform )
@@ -2916,7 +2916,7 @@ protected:
             complementComplexOutput(depth, dst_data, dst_step, len, nz, 1);
     }
 
-    void colDft(const uchar* src_data, int src_step, uchar* dst_data, int dst_step, int stage_src_channels, int stage_dst_channels, bool isLastStage)
+    void colDft(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int stage_src_channels, int stage_dst_channels, bool isLastStage)
     {
         int len = height;
         int count = width;

From 5a938309c1feba9fe81c23c17cd364943f7177cc Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@itseez.com>
Date: Tue, 19 Apr 2016 16:08:48 +0300
Subject: [PATCH 7/7] More compilation warnings fixed

---
 modules/core/src/dxt.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp
index 164b0f10a..162052667 100644
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -3787,7 +3787,7 @@ typedef IppStatus (CV_STDCALL * ippiDCTGetBufSize)(const void*, int*);
 class DctIPPLoop_Invoker : public ParallelLoopBody
 {
 public:
-    DctIPPLoop_Invoker(const uchar * _src, int _src_step, uchar * _dst, int _dst_step, int _width, bool _inv, bool *_ok) :
+    DctIPPLoop_Invoker(const uchar * _src, size_t _src_step, uchar * _dst, size_t _dst_step, int _width, bool _inv, bool *_ok) :
         ParallelLoopBody(), src(_src), src_step(_src_step), dst(_dst), dst_step(_dst_step), width(_width), inv(_inv), ok(_ok)
     {
         *ok = true;
@@ -3856,7 +3856,7 @@ public:
 
         for(int i = range.start; i < range.end; ++i)
         {
-            if(ippDctFun((float*)(src + src_step * i), src_step, (float*)(dst + dst_step * i), dst_step, pDCTSpec, pBuffer) < 0)
+            if(ippDctFun((float*)(src + src_step * i), static_cast<int>(src_step), (float*)(dst + dst_step * i), static_cast<int>(dst_step), pDCTSpec, pBuffer) < 0)
             {
                 *ok = false;
                 IPP_RETURN
@@ -3886,7 +3886,7 @@ public:
 
             for( int i = range.start; i < range.end; ++i)
             {
-                if(ippDctFun((float*)(src + src_step * i), src_step, (float*)(dst + dst_step * i), dst_step, pDCTSpec, (Ipp8u*)pBuffer) < 0)
+                if(ippDctFun((float*)(src + src_step * i), static_cast<int>(src_step), (float*)(dst + dst_step * i), static_cast<int>(dst_step), pDCTSpec, (Ipp8u*)pBuffer) < 0)
                 {
                     *ok = false;
                     break;
@@ -3908,22 +3908,22 @@ public:
 
 private:
     const uchar * src;
-    int src_step;
+    size_t src_step;
     uchar * dst;
-    int dst_step;
+    size_t dst_step;
     int width;
     bool inv;
     bool *ok;
 };
 
-static bool DctIPPLoop(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv)
+static bool DctIPPLoop(const uchar * src, size_t src_step, uchar * dst, size_t dst_step, int width, int height, bool inv)
 {
     bool ok;
     parallel_for_(Range(0, height), DctIPPLoop_Invoker(src, src_step, dst, dst_step, width, inv, &ok), height/(double)(1<<4) );
     return ok;
 }
 
-static bool ippi_DCT_32f(const uchar * src, int src_step, uchar * dst, int dst_step, int width, int height, bool inv, bool row)
+static bool ippi_DCT_32f(const uchar * src, size_t src_step, uchar * dst, size_t dst_step, int width, int height, bool inv, bool row)
 {
     if(row)
         return DctIPPLoop(src, src_step, dst, dst_step, width, height, inv);
@@ -3978,7 +3978,7 @@ static bool ippi_DCT_32f(const uchar * src, int src_step, uchar * dst, int dst_s
             return false;
         }
 
-        if(ippDctFun((float*)src, src_step, (float*)dst, dst_step, pDCTSpec, pBuffer) < 0)
+        if(ippDctFun((float*)src, static_cast<int>(src_step), (float*)dst, static_cast<int>(dst_step), pDCTSpec, pBuffer) < 0)
         {
             IPP_RELEASE
             return false;
@@ -4010,7 +4010,7 @@ static bool ippi_DCT_32f(const uchar * src, int src_step, uchar * dst, int dst_s
             buf.allocate( bufSize );
             pBuffer = (uchar*)buf;
 
-            status = ippDctFun((float*)src, src_step, (float*)dst, dst_step, pDCTSpec, (Ipp8u*)pBuffer);
+            status = ippDctFun((float*)src, static_cast<int>(src_step), (float*)dst, static_cast<int>(dst_step), pDCTSpec, (Ipp8u*)pBuffer);
         }
 
         if (pDCTSpec)