Merge remote-tracking branch 'origin/master' into merge-2.4

2014-08-01 15:01:41 +04:00
parent a3bde36c84 345b69d5f7
commit 983e75e5de
450 changed files with 11579 additions and 13407 deletions
--- a/modules/core/doc/basic_structures.rst
+++ b/modules/core/doc/basic_structures.rst
@@ -2981,7 +2981,7 @@ The class provides the following features for all derived classes:

    * so called "virtual constructor". That is, each Algorithm derivative is registered at program start and you can get the list of registered algorithms and create instance of a particular algorithm by its name (see ``Algorithm::create``). If you plan to add your own algorithms, it is good practice to add a unique prefix to your algorithms to distinguish them from other algorithms.

-    * setting/retrieving algorithm parameters by name. If you used video capturing functionality from OpenCV highgui module, you are probably familar with ``cvSetCaptureProperty()``, ``cvGetCaptureProperty()``, ``VideoCapture::set()`` and ``VideoCapture::get()``. ``Algorithm`` provides similar method where instead of integer id's you specify the parameter names as text strings. See ``Algorithm::set`` and ``Algorithm::get`` for details.
+    * setting/retrieving algorithm parameters by name. If you used video capturing functionality from OpenCV videoio module, you are probably familar with ``cvSetCaptureProperty()``, ``cvGetCaptureProperty()``, ``VideoCapture::set()`` and ``VideoCapture::get()``. ``Algorithm`` provides similar method where instead of integer id's you specify the parameter names as text strings. See ``Algorithm::set`` and ``Algorithm::get`` for details.

    * reading and writing parameters from/to XML or YAML files. Every Algorithm derivative can store all its parameters and then read them back. There is no need to re-implement it each time.

--- a/modules/core/doc/drawing_functions.rst
+++ b/modules/core/doc/drawing_functions.rst
@@ -361,6 +361,37 @@ The function ``line`` draws the line segment between ``pt1`` and ``pt2`` points
 Antialiased lines are drawn using Gaussian filtering.


+arrowedLine
+----------------
+Draws a arrow segment pointing from the first point to the second one.
+
+.. ocv:function:: void arrowedLine(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,          int thickness=1, int lineType=8, int shift=0, double tipLength=0.1)
+
+    :param img: Image.
+
+    :param pt1: The point the arrow starts from.
+
+    :param pt2: The point the arrow points to.
+
+    :param color: Line color.
+
+    :param thickness: Line thickness.
+
+    :param lineType: Type of the line:
+
+            * **8** (or omitted) - 8-connected line.
+
+            * **4** - 4-connected line.
+
+            * **CV_AA** - antialiased line.
+
+    :param shift: Number of fractional bits in the point coordinates.
+
+    :param tipLength: The length of the arrow tip in relation to the arrow length
+
+The function ``arrowedLine`` draws an arrow between ``pt1`` and ``pt2`` points in the image. See also :ocv:func:`line`.
+
+
 LineIterator
 ------------
 .. ocv:class:: LineIterator
--- a/modules/core/doc/intro.rst
+++ b/modules/core/doc/intro.rst
@@ -14,7 +14,8 @@ OpenCV has a modular structure, which means that the package includes several sh
 * **calib3d** - basic multiple-view geometry algorithms, single and stereo camera calibration, object pose estimation, stereo correspondence algorithms, and elements of 3D reconstruction.
 * **features2d** - salient feature detectors, descriptors, and descriptor matchers.
 * **objdetect** - detection of objects and instances of the predefined classes (for example, faces, eyes, mugs, people, cars, and so on).
- * **highgui** - an easy-to-use interface to video capturing, image and video codecs, as well as simple UI capabilities.
+ * **highgui** - an easy-to-use interface to simple UI capabilities.
+ * **videoio** - an easy-to-use interface to video capturing and video codecs.
 * **gpu** - GPU-accelerated algorithms from different OpenCV modules.
 * ... some other helper modules, such as FLANN and Google test wrappers, Python bindings, and others.

--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -510,6 +510,10 @@ CV_EXPORTS_W void randShuffle(InputOutputArray dst, double iterFactor = 1., RNG*
 CV_EXPORTS_W void line(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
                     int thickness = 1, int lineType = LINE_8, int shift = 0);

+//! draws an arrow from pt1 to pt2 in the image
+CV_EXPORTS_W void arrowedLine(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
+                     int thickness=1, int line_type=8, int shift=0, double tipLength=0.1);
+
 //! draws the rectangle outline or a solid rectangle with the opposite corners pt1 and pt2 in the image
 CV_EXPORTS_W void rectangle(InputOutputArray img, Point pt1, Point pt2,
                          const Scalar& color, int thickness = 1,
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -244,6 +244,7 @@ typedef signed char schar;

 /* fundamental constants */
 #define CV_PI   3.1415926535897932384626433832795
+#define CV_2PI 6.283185307179586476925286766559
 #define CV_LOG2 0.69314718055994530941723212145818

 /****************************************************************************************\
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -360,7 +360,7 @@ struct CV_EXPORTS UMatData
 {
    enum { COPY_ON_MAP=1, HOST_COPY_OBSOLETE=2,
        DEVICE_COPY_OBSOLETE=4, TEMP_UMAT=8, TEMP_COPIED_UMAT=24,
-        USER_ALLOCATED=32 };
+        USER_ALLOCATED=32, DEVICE_MEM_MAPPED=64};
    UMatData(const MatAllocator* allocator);
    ~UMatData();

@@ -370,11 +370,13 @@ struct CV_EXPORTS UMatData

    bool hostCopyObsolete() const;
    bool deviceCopyObsolete() const;
+    bool deviceMemMapped() const;
    bool copyOnMap() const;
    bool tempUMat() const;
    bool tempCopiedUMat() const;
    void markHostCopyObsolete(bool flag);
    void markDeviceCopyObsolete(bool flag);
+    void markDeviceMemMapped(bool flag);

    const MatAllocator* prevAllocator;
    const MatAllocator* currAllocator;
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -3351,10 +3351,19 @@ size_t UMat::total() const

 inline bool UMatData::hostCopyObsolete() const { return (flags & HOST_COPY_OBSOLETE) != 0; }
 inline bool UMatData::deviceCopyObsolete() const { return (flags & DEVICE_COPY_OBSOLETE) != 0; }
+inline bool UMatData::deviceMemMapped() const { return (flags & DEVICE_MEM_MAPPED) != 0; }
 inline bool UMatData::copyOnMap() const { return (flags & COPY_ON_MAP) != 0; }
 inline bool UMatData::tempUMat() const { return (flags & TEMP_UMAT) != 0; }
 inline bool UMatData::tempCopiedUMat() const { return (flags & TEMP_COPIED_UMAT) == TEMP_COPIED_UMAT; }

+inline void UMatData::markDeviceMemMapped(bool flag)
+{
+  if(flag)
+    flags |= DEVICE_MEM_MAPPED;
+  else
+    flags &= ~DEVICE_MEM_MAPPED;
+}
+
 inline void UMatData::markHostCopyObsolete(bool flag)
 {
    if(flag)
--- a/modules/core/perf/opencl/perf_dxt.cpp
+++ b/modules/core/perf/opencl/perf_dxt.cpp
@@ -54,23 +54,42 @@ namespace ocl {

 ///////////// dft ////////////////////////

-typedef tuple<Size, int> DftParams;
+enum OCL_FFT_TYPE
+{
+    R2R = 0,
+    C2R = 1,
+    R2C = 2,
+    C2C = 3
+};
+
+typedef tuple<OCL_FFT_TYPE, Size, int> DftParams;
 typedef TestBaseWithParam<DftParams> DftFixture;

-OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
-                                                Values((int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE,
-                                                       (int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE)))
+OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(C2C, R2R, C2R, R2C),
+                                                    Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, Size(512, 512), Size(1024, 1024), Size(2048, 2048)),
+                                                    Values((int) 0, (int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE,
+                                                           (int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE)))
 {
    const DftParams params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int flags = get<1>(params);
+    const int dft_type = get<0>(params);
+    const Size srcSize = get<1>(params);
+    int flags = get<2>(params);

-    UMat src(srcSize, CV_32FC2), dst(srcSize, CV_32FC2);
+    int in_cn, out_cn;
+    switch (dft_type)
+    {
+    case R2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 1; out_cn = 1; break;
+    case C2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 2; out_cn = 2; break;
+    case R2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 1; out_cn = 2; break;
+    case C2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 2; out_cn = 2; break;
+    }
+
+    UMat src(srcSize, CV_MAKE_TYPE(CV_32F, in_cn)), dst(srcSize, CV_MAKE_TYPE(CV_32F, out_cn));
    declare.in(src, WARMUP_RNG).out(dst);

-    OCL_TEST_CYCLE() cv::dft(src, dst, flags | DFT_COMPLEX_OUTPUT);
+    OCL_TEST_CYCLE() cv::dft(src, dst, flags);

-    SANITY_CHECK(dst, 1e-3);
+    SANITY_CHECK(dst, 1e-5, ERROR_RELATIVE);
 }

 ///////////// MulSpectrums ////////////////////////
--- a/modules/core/perf/opencl/perf_matop.cpp
+++ b/modules/core/perf/opencl/perf_matop.cpp
@@ -139,6 +139,7 @@ OCL_PERF_TEST_P(CopyToFixture, CopyToWithMaskUninit,
        dst.release();
        startTimer();
        src.copyTo(dst, mask);
+        cv::ocl::finish();
        stopTimer();
    }

--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -54,21 +54,23 @@ namespace cv

 struct NOP {};

-#if CV_SSE2
+#if CV_SSE2 || CV_NEON

 #define FUNCTOR_TEMPLATE(name)          \
    template<typename T> struct name {}

 FUNCTOR_TEMPLATE(VLoadStore128);
+#if CV_SSE2
 FUNCTOR_TEMPLATE(VLoadStore64);
 FUNCTOR_TEMPLATE(VLoadStore128Aligned);
+#endif

 #endif

 template<typename T, class Op, class VOp>
 void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz)
 {
-#if CV_SSE2
+#if CV_SSE2 || CV_NEON
    VOp vop;
 #endif
    Op op;
@@ -79,9 +81,11 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si
    {
        int x = 0;

+#if CV_NEON || CV_SSE2
 #if CV_SSE2
        if( USE_SSE2 )
        {
+#endif
            for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
            {
                typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x               );
@@ -91,8 +95,10 @@ void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, si
                VLoadStore128<T>::store(dst + x               , r0);
                VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
            }
+#if CV_SSE2
        }
 #endif
+#endif
 #if CV_SSE2
        if( USE_SSE2 )
        {
@@ -125,7 +131,7 @@ template<typename T, class Op, class Op32>
 void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, Size sz)
 {
-#if CV_SSE2
+#if CV_SSE2 || CV_NEON
    Op32 op32;
 #endif
    Op op;
@@ -153,9 +159,11 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
            }
        }
 #endif
+#if CV_NEON || CV_SSE2
 #if CV_SSE2
        if( USE_SSE2 )
        {
+#endif
            for( ; x <= sz.width - 8; x += 8 )
            {
                typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x    );
@@ -165,8 +173,10 @@ void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
                VLoadStore128<T>::store(dst + x    , r0);
                VLoadStore128<T>::store(dst + x + 4, r1);
            }
+#if CV_SSE2
        }
 #endif
+#endif
 #if CV_ENABLE_UNROLLED
        for( ; x <= sz.width - 4; x += 4 )
        {
@@ -383,7 +393,98 @@ FUNCTOR_TEMPLATE(VNot);
 FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));
 #endif

-#if CV_SSE2
+#if CV_NEON
+
+#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
+    template <>                                                                \
+    struct name<template_arg>{                                                 \
+        typedef register_type reg_type;                                        \
+        static reg_type load(const template_arg * p) { return load_body (p);}; \
+        static void store(template_arg * p, reg_type v) { store_body (p, v);}; \
+    }
+
+#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
+    template<>                                                         \
+    struct name<template_arg>                                          \
+    {                                                                  \
+        VLoadStore128<template_arg>::reg_type operator()(              \
+                        VLoadStore128<template_arg>::reg_type a,       \
+                        VLoadStore128<template_arg>::reg_type b) const \
+        {                                                              \
+            return body;                                               \
+        };                                                             \
+    }
+
+#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
+    template<>                                                         \
+    struct name<template_arg>                                          \
+    {                                                                  \
+        VLoadStore128<template_arg>::reg_type operator()(              \
+                        VLoadStore128<template_arg>::reg_type a,       \
+                        VLoadStore128<template_arg>::reg_type  ) const \
+        {                                                              \
+            return body;                                               \
+        };                                                             \
+    }
+
+FUNCTOR_LOADSTORE(VLoadStore128,  uchar,  uint8x16_t, vld1q_u8 , vst1q_u8 );
+FUNCTOR_LOADSTORE(VLoadStore128,  schar,   int8x16_t, vld1q_s8 , vst1q_s8 );
+FUNCTOR_LOADSTORE(VLoadStore128, ushort,  uint16x8_t, vld1q_u16, vst1q_u16);
+FUNCTOR_LOADSTORE(VLoadStore128,  short,   int16x8_t, vld1q_s16, vst1q_s16);
+FUNCTOR_LOADSTORE(VLoadStore128,    int,   int32x4_t, vld1q_s32, vst1q_s32);
+FUNCTOR_LOADSTORE(VLoadStore128,  float, float32x4_t, vld1q_f32, vst1q_f32);
+
+FUNCTOR_TEMPLATE(VAdd);
+FUNCTOR_CLOSURE_2arg(VAdd,  uchar, vqaddq_u8 (a, b));
+FUNCTOR_CLOSURE_2arg(VAdd,  schar, vqaddq_s8 (a, b));
+FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
+FUNCTOR_CLOSURE_2arg(VAdd,  short, vqaddq_s16(a, b));
+FUNCTOR_CLOSURE_2arg(VAdd,    int, vaddq_s32 (a, b));
+FUNCTOR_CLOSURE_2arg(VAdd,  float, vaddq_f32 (a, b));
+
+FUNCTOR_TEMPLATE(VSub);
+FUNCTOR_CLOSURE_2arg(VSub,  uchar, vqsubq_u8 (a, b));
+FUNCTOR_CLOSURE_2arg(VSub,  schar, vqsubq_s8 (a, b));
+FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
+FUNCTOR_CLOSURE_2arg(VSub,  short, vqsubq_s16(a, b));
+FUNCTOR_CLOSURE_2arg(VSub,    int, vsubq_s32 (a, b));
+FUNCTOR_CLOSURE_2arg(VSub,  float, vsubq_f32 (a, b));
+
+FUNCTOR_TEMPLATE(VMin);
+FUNCTOR_CLOSURE_2arg(VMin,  uchar, vminq_u8 (a, b));
+FUNCTOR_CLOSURE_2arg(VMin,  schar, vminq_s8 (a, b));
+FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
+FUNCTOR_CLOSURE_2arg(VMin,  short, vminq_s16(a, b));
+FUNCTOR_CLOSURE_2arg(VMin,    int, vminq_s32(a, b));
+FUNCTOR_CLOSURE_2arg(VMin,  float, vminq_f32(a, b));
+
+FUNCTOR_TEMPLATE(VMax);
+FUNCTOR_CLOSURE_2arg(VMax,  uchar, vmaxq_u8 (a, b));
+FUNCTOR_CLOSURE_2arg(VMax,  schar, vmaxq_s8 (a, b));
+FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
+FUNCTOR_CLOSURE_2arg(VMax,  short, vmaxq_s16(a, b));
+FUNCTOR_CLOSURE_2arg(VMax,    int, vmaxq_s32(a, b));
+FUNCTOR_CLOSURE_2arg(VMax,  float, vmaxq_f32(a, b));
+
+FUNCTOR_TEMPLATE(VAbsDiff);
+FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar, vabdq_u8  (a, b));
+FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar, vqabsq_s8 (vqsubq_s8(a, b)));
+FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
+FUNCTOR_CLOSURE_2arg(VAbsDiff,  short, vqabsq_s16(vqsubq_s16(a, b)));
+FUNCTOR_CLOSURE_2arg(VAbsDiff,    int, vabdq_s32 (a, b));
+FUNCTOR_CLOSURE_2arg(VAbsDiff,  float, vabdq_f32 (a, b));
+
+FUNCTOR_TEMPLATE(VAnd);
+FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
+FUNCTOR_TEMPLATE(VOr);
+FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
+FUNCTOR_TEMPLATE(VXor);
+FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
+FUNCTOR_TEMPLATE(VNot);
+FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a   ));
+#endif
+
+#if CV_SSE2 || CV_NEON
 #define IF_SIMD(op) op
 #else
 #define IF_SIMD(op) NOP
@@ -1390,6 +1491,9 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
    if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F))
        return false;

+    if( (oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE) && (depth1 >= CV_32F || depth2 >= CV_32F || ddepth >= CV_32F) )
+        return false;
+
    int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
    int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1;

@@ -2980,8 +3084,187 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
 namespace cv
 {

-template<typename T> static void
-inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
+template <typename T>
+struct InRange_SSE
+{
+    int operator () (const T *, const T *, const T *, uchar *, int) const
+    {
+        return 0;
+    }
+};
+
+#if CV_SSE2
+
+template <>
+struct InRange_SSE<uchar>
+{
+    int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        if (USE_SSE2)
+        {
+            __m128i v_full = _mm_set1_epi8(-1), v_128 = _mm_set1_epi8(-128);
+
+            for ( ; x <= len - 16; x += 16 )
+            {
+                __m128i v_src = _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), v_128);
+                __m128i v_mask1 = _mm_cmpgt_epi8(_mm_add_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_128), v_src);
+                __m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src3 + x)), v_128));
+                _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct InRange_SSE<schar>
+{
+    int operator () (const schar * src1, const schar * src2, const schar * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        if (USE_SSE2)
+        {
+            __m128i v_full = _mm_set1_epi8(-1);
+
+            for ( ; x <= len - 16; x += 16 )
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
+                __m128i v_mask1 = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
+                __m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
+                _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct InRange_SSE<ushort>
+{
+    int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        if (USE_SSE2)
+        {
+            __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1), v_32768 = _mm_set1_epi16(-32768);
+
+            for ( ; x <= len - 8; x += 8 )
+            {
+                __m128i v_src = _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src1 + x)), v_32768);
+                __m128i v_mask1 = _mm_cmpgt_epi16(_mm_add_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_32768), v_src);
+                __m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src3 + x)), v_32768));
+                __m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
+                _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct InRange_SSE<short>
+{
+    int operator () (const short * src1, const short * src2, const short * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        if (USE_SSE2)
+        {
+            __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1);
+
+            for ( ; x <= len - 8; x += 8 )
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
+                __m128i v_mask1 = _mm_cmpgt_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
+                __m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
+                __m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
+                _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct InRange_SSE<int>
+{
+    int operator () (const int * src1, const int * src2, const int * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        if (USE_SSE2)
+        {
+            __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi32(-1);
+
+            for ( ; x <= len - 8; x += 8 )
+            {
+                __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
+                __m128i v_res1 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src),
+                    _mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x))));
+
+                v_src = _mm_loadu_si128((const __m128i *)(src1 + x + 4));
+                __m128i v_res2 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x + 4)), v_src),
+                    _mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x + 4))));
+
+                __m128i v_res = _mm_packs_epi32(_mm_srli_epi32(_mm_andnot_si128(v_res1, v_full), 16),
+                                                _mm_srli_epi32(_mm_andnot_si128(v_res2, v_full), 16));
+                _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
+            }
+        }
+
+        return x;
+    }
+};
+
+template <>
+struct InRange_SSE<float>
+{
+    int operator () (const float * src1, const float * src2, const float * src3,
+                     uchar * dst, int len) const
+    {
+        int x = 0;
+
+        if (USE_SSE2)
+        {
+            __m128i v_zero = _mm_setzero_si128();
+
+            for ( ; x <= len - 8; x += 8 )
+            {
+                __m128 v_src = _mm_loadu_ps(src1 + x);
+                __m128 v_res1 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x), v_src),
+                    _mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x)));
+
+                v_src = _mm_loadu_ps(src1 + x + 4);
+                __m128 v_res2 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x + 4), v_src),
+                    _mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x + 4)));
+
+                __m128i v_res1i = _mm_cvtps_epi32(v_res1), v_res2i = _mm_cvtps_epi32(v_res2);
+                __m128i v_res = _mm_packs_epi32(_mm_srli_epi32(v_res1i, 16), _mm_srli_epi32(v_res2i, 16));
+                _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
+            }
+        }
+
+        return x;
+    }
+};
+
+#endif
+
+template <typename T>
+static void inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
         const T* src3, size_t step3, uchar* dst, size_t step,
         Size size)
 {
@@ -2989,9 +3272,11 @@ inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
    step2 /= sizeof(src2[0]);
    step3 /= sizeof(src3[0]);

+    InRange_SSE<T> vop;
+
    for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
    {
-        int x = 0;
+        int x = vop(src1, src2, src3, dst, size.width);
        #if CV_ENABLE_UNROLLED
        for( ; x <= size.width - 4; x += 4 )
        {
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -1541,7 +1541,7 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha
        kercn = ocl::predictOptimalVectorWidth(_src, _dst), rowsPerWI = d.isIntel() ? 4 : 1;
    bool doubleSupport = d.doubleFPConfig() > 0;

-    if (!doubleSupport && depth == CV_64F)
+    if (depth == CV_32F || depth == CV_64F)
        return false;

    char cvt[2][50];
@@ -1729,22 +1729,18 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
    UMat src = _src.getUMat(), lut = _lut.getUMat();
    _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
    UMat dst = _dst.getUMat();
-    bool bAligned = (1 == lcn) && (0 == (src.offset % 4)) && (0 == ((dcn * src.cols) % 4));
-    // dst.cols == src.cols by params of dst.create
+    int kercn = lcn == 1 ? std::min(4, ocl::predictOptimalVectorWidth(_dst)) : dcn;

    ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
-                  format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", bAligned ? 4 : dcn, lcn,
-                         ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)
-                         ));
+                  format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", kercn, lcn,
+                         ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)));
    if (k.empty())
        return false;

-    int cols = bAligned ? dcn * dst.cols / 4 : dst.cols;
-
    k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
-        ocl::KernelArg::WriteOnlyNoSize(dst), dst.rows, cols);
+        ocl::KernelArg::WriteOnly(dst, dcn, kercn));

-    size_t globalSize[2] = { cols, (dst.rows + 3) / 4 };
+    size_t globalSize[2] = { dst.cols * dcn / kercn, (dst.rows + 3) / 4 };
    return k.run(2, globalSize, NULL, false);
 }

--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -432,7 +432,7 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)

        IppStatus status = (IppStatus)-1;
        IppiSize roisize = { cols, rows };
-        int mstep = (int)mask.step, dstep = (int)step;
+        int mstep = (int)mask.step[0], dstep = (int)step[0];

        if (isContinuous() && mask.isContinuous())
        {
@@ -616,7 +616,7 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
 {
    CV_Assert(flipCode >= -1 && flipCode <= 1);
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
-            flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);;
+            flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);

    if (cn > 4)
        return false;
@@ -631,7 +631,7 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )

    ocl::Device dev = ocl::Device::getDefault();
    int pxPerWIy = (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1;
-    kercn = std::max(kercn, cn);
+    kercn = (cn!=3 || flipType == FLIP_ROWS) ? std::max(kercn, cn) : cn;

    ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
        format( "-D T=%s -D T1=%s -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d",
@@ -762,7 +762,7 @@ void flip( InputArray _src, OutputArray _dst, int flip_mode )
        flipHoriz( dst.data, dst.step, dst.data, dst.step, dst.size(), esz );
 }

-#ifdef HAVE_OPENCL
+/*#ifdef HAVE_OPENCL

 static bool ocl_repeat(InputArray _src, int ny, int nx, OutputArray _dst)
 {
@@ -790,7 +790,7 @@ static bool ocl_repeat(InputArray _src, int ny, int nx, OutputArray _dst)
    return k.run(2, globalsize, NULL, false);
 }

-#endif
+#endif*/

 void repeat(InputArray _src, int ny, int nx, OutputArray _dst)
 {
@@ -800,8 +800,8 @@ void repeat(InputArray _src, int ny, int nx, OutputArray _dst)
    Size ssize = _src.size();
    _dst.create(ssize.height*ny, ssize.width*nx, _src.type());

-    CV_OCL_RUN(_dst.isUMat(),
-               ocl_repeat(_src, ny, nx, _dst))
+    /*CV_OCL_RUN(_dst.isUMat(),
+               ocl_repeat(_src, ny, nx, _dst))*/

    Mat src = _src.getMat(), dst = _dst.getMat();
    Size dsize = dst.size();
--- a/modules/core/src/cuda_buffer_pool.cpp
+++ b/modules/core/src/cuda_buffer_pool.cpp
@@ -207,7 +207,6 @@ namespace
    MemoryStack* MemoryPool::getFreeMemStack()
    {
        AutoLock lock(mtx_);
-
        if (!initialized_)
            initilizeImpl();

@@ -256,22 +255,31 @@ namespace

 namespace
 {
+    Mutex mtx_;
+    bool memory_pool_manager_initialized;
+
    class MemoryPoolManager
    {
    public:
        MemoryPoolManager();
        ~MemoryPoolManager();
+        void Init();

        MemoryPool* getPool(int deviceId);

    private:
        std::vector<MemoryPool> pools_;
-    };
+    } manager;
+
+    //MemoryPoolManager ;

    MemoryPoolManager::MemoryPoolManager()
    {
-        int deviceCount = getCudaEnabledDeviceCount();
+    }

+    void MemoryPoolManager::Init()
+    {
+        int deviceCount = getCudaEnabledDeviceCount();
        if (deviceCount > 0)
            pools_.resize(deviceCount);
    }
@@ -280,7 +288,7 @@ namespace
    {
        for (size_t i = 0; i < pools_.size(); ++i)
        {
-            cudaSetDevice(i);
+            cudaSetDevice(static_cast<int>(i));
            pools_[i].release();
        }
    }
@@ -293,7 +301,14 @@ namespace

    MemoryPool* memPool(int deviceId)
    {
-        static MemoryPoolManager manager;
+        {
+            AutoLock lock(mtx_);
+            if (!memory_pool_manager_initialized)
+            {
+                memory_pool_manager_initialized = true;
+                manager.Init();
+            }
+        }
        return manager.getPool(deviceId);
    }
 }
@@ -311,8 +326,10 @@ cv::cuda::StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream),
    if (enableMemoryPool)
    {
        const int deviceId = getDevice();
-        memStack_ = memPool(deviceId)->getFreeMemStack();
-
+        {
+            AutoLock lock(mtx_);
+            memStack_ = memPool(deviceId)->getFreeMemStack();
+        }
        DeviceInfo devInfo(deviceId);
        alignment_ = devInfo.textureAlignment();
    }
--- a/modules/core/src/cuda_stream.cpp
+++ b/modules/core/src/cuda_stream.cpp
@@ -190,10 +190,22 @@ void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userDa
 #endif
 }

+namespace
+{
+    bool default_stream_is_initialized;
+    Mutex mtx;
+    Ptr<Stream> default_stream;
+}
+
 Stream& cv::cuda::Stream::Null()
 {
-    static Stream s(Ptr<Impl>(new Impl(0)));
-    return s;
+    AutoLock lock(mtx);
+    if (!default_stream_is_initialized)
+    {
+        default_stream = Ptr<Stream>(new Stream(Ptr<Impl>(new Impl(0))));
+        default_stream_is_initialized = true;
+    }
+    return *default_stream;
 }

 cv::cuda::Stream::operator bool_type() const
--- a/modules/core/src/drawing.cpp
+++ b/modules/core/src/drawing.cpp
@@ -1584,6 +1584,24 @@ void line( InputOutputArray _img, Point pt1, Point pt2, const Scalar& color,
    ThickLine( img, pt1, pt2, buf, thickness, line_type, 3, shift );
 }

+void arrowedLine(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
+           int thickness, int line_type, int shift, double tipLength)
+{
+    const double tipSize = norm(pt1-pt2)*tipLength; // Factor to normalize the size of the tip depending on the length of the arrow
+
+    line(img, pt1, pt2, color, thickness, line_type, shift);
+
+    const double angle = atan2( (double) pt1.y - pt2.y, (double) pt1.x - pt2.x );
+
+    Point p(cvRound(pt2.x + tipSize * cos(angle + CV_PI / 4)),
+        cvRound(pt2.y + tipSize * sin(angle + CV_PI / 4)));
+    line(img, p, pt2, color, thickness, line_type, shift);
+
+    p.x = cvRound(pt2.x + tipSize * cos(angle - CV_PI / 4));
+    p.y = cvRound(pt2.y + tipSize * sin(angle - CV_PI / 4));
+    line(img, p, pt2, color, thickness, line_type, shift);
+}
+
 void rectangle( InputOutputArray _img, Point pt1, Point pt2,
                const Scalar& color, int thickness,
                int lineType, int shift )
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -43,6 +43,7 @@
 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
 #include "opencl_kernels.hpp"
+#include <map>

 namespace cv
 {
@@ -1781,6 +1782,375 @@ static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
 #endif
 }

+#ifdef HAVE_OPENCL
+
+namespace cv
+{
+
+enum FftType
+{
+    R2R = 0, // real to CCS in case forward transform, CCS to real otherwise
+    C2R = 1, // complex to real in case inverse transform
+    R2C = 2, // real to complex in case forward transform
+    C2C = 3  // complex to complex
+};
+
+struct OCL_FftPlan
+{
+private:
+    UMat twiddles;
+    String buildOptions;
+    int thread_count;
+    bool status;
+    int dft_size;
+
+public:
+    OCL_FftPlan(int _size): dft_size(_size), status(true)
+    {
+        int min_radix;
+        std::vector<int> radixes, blocks;
+        ocl_getRadixes(dft_size, radixes, blocks, min_radix);
+        thread_count = dft_size / min_radix;
+
+        if (thread_count > (int) ocl::Device::getDefault().maxWorkGroupSize())
+        {
+            status = false;
+            return;
+        }
+
+        // generate string with radix calls
+        String radix_processing;
+        int n = 1, twiddle_size = 0;
+        for (size_t i=0; i<radixes.size(); i++)
+        {
+            int radix = radixes[i], block = blocks[i];
+            if (block > 1)
+                radix_processing += format("fft_radix%d_B%d(smem,twiddles+%d,ind,%d,%d);", radix, block, twiddle_size, n, dft_size/radix);
+            else
+                radix_processing += format("fft_radix%d(smem,twiddles+%d,ind,%d,%d);", radix, twiddle_size, n, dft_size/radix);
+            twiddle_size += (radix-1)*n;
+            n *= radix;
+        }
+
+        Mat tw(1, twiddle_size, CV_32FC2);
+        float* ptr = tw.ptr<float>();
+        int ptr_index = 0;
+
+        n = 1;
+        for (size_t i=0; i<radixes.size(); i++)
+        {
+            int radix = radixes[i];
+            n *= radix;
+
+            for (int j=1; j<radix; j++)
+            {
+                double theta = -CV_2PI*j/n;
+
+                for (int k=0; k<(n/radix); k++)
+                {
+                    ptr[ptr_index++] = (float) cos(k*theta);
+                    ptr[ptr_index++] = (float) sin(k*theta);
+                }
+            }
+        }
+        twiddles = tw.getUMat(ACCESS_READ);
+
+        buildOptions = format("-D LOCAL_SIZE=%d -D kercn=%d -D RADIX_PROCESS=%s",
+                              dft_size, min_radix, radix_processing.c_str());
+    }
+
+    bool enqueueTransform(InputArray _src, OutputArray _dst, int num_dfts, int flags, int fftType, bool rows = true) const
+    {
+        if (!status)
+            return false;
+
+        UMat src = _src.getUMat();
+        UMat dst = _dst.getUMat();
+
+        size_t globalsize[2];
+        size_t localsize[2];
+        String kernel_name;
+
+        bool is1d = (flags & DFT_ROWS) != 0 || num_dfts == 1;
+        bool inv = (flags & DFT_INVERSE) != 0;
+        String options = buildOptions;
+
+        if (rows)
+        {
+            globalsize[0] = thread_count; globalsize[1] = src.rows;
+            localsize[0] = thread_count; localsize[1] = 1;
+            kernel_name = !inv ? "fft_multi_radix_rows" : "ifft_multi_radix_rows";
+            if ((is1d || inv) && (flags & DFT_SCALE))
+                options += " -D DFT_SCALE";
+        }
+        else
+        {
+            globalsize[0] = num_dfts; globalsize[1] = thread_count;
+            localsize[0] = 1; localsize[1] = thread_count;
+            kernel_name = !inv ? "fft_multi_radix_cols" : "ifft_multi_radix_cols";
+            if (flags & DFT_SCALE)
+                options += " -D DFT_SCALE";
+        }
+
+        options += src.channels() == 1 ? " -D REAL_INPUT" : " -D COMPLEX_INPUT";
+        options += dst.channels() == 1 ? " -D REAL_OUTPUT" : " -D COMPLEX_OUTPUT";
+        options += is1d ? " -D IS_1D" : "";
+
+        if (!inv)
+        {
+            if ((is1d && src.channels() == 1) || (rows && (fftType == R2R)))
+                options += " -D NO_CONJUGATE";
+        }
+        else
+        {
+            if (rows && (fftType == C2R || fftType == R2R))
+                options += " -D NO_CONJUGATE";
+            if (dst.cols % 2 == 0)
+                options += " -D EVEN";
+        }
+
+        ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, options);
+        if (k.empty())
+            return false;
+
+        k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(twiddles), thread_count, num_dfts);
+        return k.run(2, globalsize, localsize, false);
+    }
+
+private:
+    static void ocl_getRadixes(int cols, std::vector<int>& radixes, std::vector<int>& blocks, int& min_radix)
+    {
+        int factors[34];
+        int nf = DFTFactorize(cols, factors);
+
+        int n = 1;
+        int factor_index = 0;
+        min_radix = INT_MAX;
+
+        // 2^n transforms
+        if ((factors[factor_index] & 1) == 0)
+        {
+            for( ; n < factors[factor_index];)
+            {
+                int radix = 2, block = 1;
+                if (8*n <= factors[0])
+                    radix = 8;
+                else if (4*n <= factors[0])
+                {
+                    radix = 4;
+                    if (cols % 12 == 0)
+                        block = 3;
+                    else if (cols % 8 == 0)
+                        block = 2;
+                }
+                else
+                {
+                    if (cols % 10 == 0)
+                        block = 5;
+                    else if (cols % 8 == 0)
+                        block = 4;
+                    else if (cols % 6 == 0)
+                        block = 3;
+                    else if (cols % 4 == 0)
+                        block = 2;
+                }
+
+                radixes.push_back(radix);
+                blocks.push_back(block);
+                min_radix = min(min_radix, block*radix);
+                n *= radix;
+            }
+            factor_index++;
+        }
+
+        // all the other transforms
+        for( ; factor_index < nf; factor_index++)
+        {
+            int radix = factors[factor_index], block = 1;
+            if (radix == 3)
+            {
+                if (cols % 12 == 0)
+                    block = 4;
+                else if (cols % 9 == 0)
+                    block = 3;
+                else if (cols % 6 == 0)
+                    block = 2;
+            }
+            else if (radix == 5)
+            {
+                if (cols % 10 == 0)
+                    block = 2;
+            }
+            radixes.push_back(radix);
+            blocks.push_back(block);
+            min_radix = min(min_radix, block*radix);
+        }
+    }
+};
+
+class OCL_FftPlanCache
+{
+public:
+    static OCL_FftPlanCache & getInstance()
+    {
+        static OCL_FftPlanCache planCache;
+        return planCache;
+    }
+
+    Ptr<OCL_FftPlan> getFftPlan(int dft_size)
+    {
+        std::map<int, Ptr<OCL_FftPlan> >::iterator f = planStorage.find(dft_size);
+        if (f != planStorage.end())
+        {
+            return f->second;
+        }
+        else
+        {
+            Ptr<OCL_FftPlan> newPlan = Ptr<OCL_FftPlan>(new OCL_FftPlan(dft_size));
+            planStorage[dft_size] = newPlan;
+            return newPlan;
+        }
+    }
+
+    ~OCL_FftPlanCache()
+    {
+        planStorage.clear();
+    }
+
+protected:
+    OCL_FftPlanCache() :
+        planStorage()
+    {
+    }
+    std::map<int, Ptr<OCL_FftPlan> > planStorage;
+};
+
+static bool ocl_dft_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType)
+{
+    Ptr<OCL_FftPlan> plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols());
+    return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, true);
+}
+
+static bool ocl_dft_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType)
+{
+    Ptr<OCL_FftPlan> plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows());
+    return plan->enqueueTransform(_src, _dst, nonzero_cols, flags, fftType, false);
+}
+
+static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows)
+{
+    int type = _src.type(), cn = CV_MAT_CN(type);
+    Size ssize = _src.size();
+    if ( !(type == CV_32FC1 || type == CV_32FC2) )
+        return false;
+
+    // if is not a multiplication of prime numbers { 2, 3, 5 }
+    if (ssize.area() != getOptimalDFTSize(ssize.area()))
+        return false;
+
+    UMat src = _src.getUMat();
+    int complex_input = cn == 2 ? 1 : 0;
+    int complex_output = (flags & DFT_COMPLEX_OUTPUT) != 0;
+    int real_input = cn == 1 ? 1 : 0;
+    int real_output = (flags & DFT_REAL_OUTPUT) != 0;
+    bool inv = (flags & DFT_INVERSE) != 0 ? 1 : 0;
+
+    if( nonzero_rows <= 0 || nonzero_rows > _src.rows() )
+        nonzero_rows = _src.rows();
+    bool is1d = (flags & DFT_ROWS) != 0 || nonzero_rows == 1;
+
+    // if output format is not specified
+    if (complex_output + real_output == 0)
+    {
+        if (real_input)
+            real_output = 1;
+        else
+            complex_output = 1;
+    }
+
+    FftType fftType = (FftType)(complex_input << 0 | complex_output << 1);
+
+    // Forward Complex to CCS not supported
+    if (fftType == C2R && !inv)
+        fftType = C2C;
+
+    // Inverse CCS to Complex not supported
+    if (fftType == R2C && inv)
+        fftType = R2R;
+
+    UMat output;
+    if (fftType == C2C || fftType == R2C)
+    {
+        // complex output
+        _dst.create(src.size(), CV_32FC2);
+        output = _dst.getUMat();
+    }
+    else
+    {
+        // real output
+        if (is1d)
+        {
+            _dst.create(src.size(), CV_32FC1);
+            output = _dst.getUMat();
+        }
+        else
+        {
+            _dst.create(src.size(), CV_32FC1);
+            output.create(src.size(), CV_32FC2);
+        }
+    }
+
+    if (!inv)
+    {
+        if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType))
+            return false;
+
+        if (!is1d)
+        {
+            int nonzero_cols = fftType == R2R ? output.cols/2 + 1 : output.cols;
+            if (!ocl_dft_cols(output, _dst, nonzero_cols, flags, fftType))
+                return false;
+        }
+    }
+    else
+    {
+        if (fftType == C2C)
+        {
+            // complex output
+            if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType))
+                return false;
+
+            if (!is1d)
+            {
+                if (!ocl_dft_cols(output, output, output.cols, flags, fftType))
+                    return false;
+            }
+        }
+        else
+        {
+            if (is1d)
+            {
+                if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType))
+                    return false;
+            }
+            else
+            {
+                int nonzero_cols = src.cols/2 + 1;
+                if (!ocl_dft_cols(src, output, nonzero_cols, flags, fftType))
+                    return false;
+
+                if (!ocl_dft_rows(output, _dst, nonzero_rows, flags, fftType))
+                    return false;
+            }
+        }
+    }
+    return true;
+}
+
+} // namespace cv;
+
+#endif
+
 #ifdef HAVE_CLAMDFFT

 namespace cv {
@@ -1791,14 +2161,6 @@ namespace cv {
        CV_Assert(s == CLFFT_SUCCESS); \
    }

-enum FftType
-{
-    R2R = 0, // real to real
-    C2R = 1, // opencl HERMITIAN_INTERLEAVED to real
-    R2C = 2, // real to opencl HERMITIAN_INTERLEAVED
-    C2C = 3  // complex to complex
-};
-
 class PlanCache
 {
    struct FftPlan
@@ -1923,7 +2285,7 @@ public:
        }

        // no baked plan is found, so let's create a new one
-        FftPlan * newPlan = new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType);
+        Ptr<FftPlan> newPlan = Ptr<FftPlan>(new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType));
        planStorage.push_back(newPlan);

        return newPlan->plHandle;
@@ -1931,8 +2293,6 @@ public:

    ~PlanCache()
    {
-        for (std::vector<FftPlan *>::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i)
-            delete (*i);
        planStorage.clear();
    }

@@ -1942,7 +2302,7 @@ protected:
    {
    }

-    std::vector<FftPlan *> planStorage;
+    std::vector<Ptr<FftPlan> > planStorage;
 };

 extern "C" {
@@ -1960,7 +2320,7 @@ static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)

 }

-static bool ocl_dft(InputArray _src, OutputArray _dst, int flags)
+static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags)
 {
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    Size ssize = _src.size();
@@ -2019,7 +2379,6 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags)

    tmpBuffer.addref();
    clSetEventCallback(e, CL_COMPLETE, oclCleanupCallback, tmpBuffer.u);
-
    return true;
 }

@@ -2034,7 +2393,12 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
 #ifdef HAVE_CLAMDFFT
    CV_OCL_RUN(ocl::haveAmdFft() && ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU &&
            _dst.isUMat() && _src0.dims() <= 2 && nonzero_rows == 0,
-               ocl_dft(_src0, _dst, flags))
+               ocl_dft_amdfft(_src0, _dst, flags))
+#endif
+
+#ifdef HAVE_OPENCL
+    CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2,
+               ocl_dft(_src0, _dst, flags, nonzero_rows))
 #endif

    static DFTFunc dft_tbl[6] =
@@ -2046,10 +2410,8 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
        (DFTFunc)RealDFT_64f,
        (DFTFunc)CCSIDFT_64f
    };
-
    AutoBuffer<uchar> buf;
    void *spec = 0;
-
    Mat src0 = _src0.getMat(), src = src0;
    int prev_len = 0, stage = 0;
    bool inv = (flags & DFT_INVERSE) != 0;
@@ -2080,32 +2442,32 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
    {
        if ((flags & DFT_ROWS) == 0)
        {
-            if (!real_transform)
+            if (src.channels() == 2 && !(inv && (flags & DFT_REAL_OUTPUT)))
            {
-                if (ippi_DFT_C_32F(src,dst, inv, ipp_norm_flag))
+                if (ippi_DFT_C_32F(src, dst, inv, ipp_norm_flag))
                    return;
                setIppErrorStatus();
            }
-            else if (inv || !(flags & DFT_COMPLEX_OUTPUT))
+            if (src.channels() == 1 && (inv || !(flags & DFT_COMPLEX_OUTPUT)))
            {
-                if (ippi_DFT_R_32F(src,dst, inv, ipp_norm_flag))
+                if (ippi_DFT_R_32F(src, dst, inv, ipp_norm_flag))
                    return;
                setIppErrorStatus();
            }
        }
        else
        {
-            if (!real_transform)
+            if (src.channels() == 2 && !(inv && (flags & DFT_REAL_OUTPUT)))
            {
                ippiDFT_C_Func ippiFunc = inv ? (ippiDFT_C_Func)ippiDFTInv_CToC_32fc_C1R : (ippiDFT_C_Func)ippiDFTFwd_CToC_32fc_C1R;
-                if (Dft_C_IPPLoop(src,dst, IPPDFT_C_Functor(ippiFunc),ipp_norm_flag))
+                if (Dft_C_IPPLoop(src, dst, IPPDFT_C_Functor(ippiFunc),ipp_norm_flag))
                    return;
                setIppErrorStatus();
            }
-            else if (inv || !(flags & DFT_COMPLEX_OUTPUT))
+            if (src.channels() == 1 && (inv || !(flags & DFT_COMPLEX_OUTPUT)))
            {
                ippiDFT_R_Func ippiFunc = inv ? (ippiDFT_R_Func)ippiDFTInv_PackToR_32f_C1R : (ippiDFT_R_Func)ippiDFTFwd_RToPack_32f_C1R;
-                if (Dft_R_IPPLoop(src,dst, IPPDFT_R_Functor(ippiFunc),ipp_norm_flag))
+                if (Dft_R_IPPLoop(src, dst, IPPDFT_R_Functor(ippiFunc),ipp_norm_flag))
                    return;
                setIppErrorStatus();
            }
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -348,7 +348,18 @@ static void InvSqrt_32f(const float* src, float* dst, int len)

 static void InvSqrt_64f(const double* src, double* dst, int len)
 {
-    for( int i = 0; i < len; i++ )
+    int i = 0;
+
+#if CV_SSE2
+    if (USE_SSE2)
+    {
+        __m128d v_1 = _mm_set1_pd(1.0);
+        for ( ; i <= len - 2; i += 2)
+            _mm_storeu_pd(dst + i, _mm_div_pd(v_1, _mm_sqrt_pd(_mm_loadu_pd(src + i))));
+    }
+#endif
+
+    for( ; i < len; i++ )
        dst[i] = 1/std::sqrt(src[i]);
 }

@@ -2543,12 +2554,33 @@ void patchNaNs( InputOutputArray _a, double _val )
    NAryMatIterator it(arrays, (uchar**)ptrs);
    size_t len = it.size*a.channels();
    Cv32suf val;
-    val.f = (float)_val;
+    float fval = (float)_val;
+    val.f = fval;
+
+#if CV_SSE2
+    __m128i v_mask1 = _mm_set1_epi32(0x7fffffff), v_mask2 = _mm_set1_epi32(0x7f800000);
+    __m128i v_val = _mm_set1_epi32(val.i);
+#endif

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        int* tptr = ptrs[0];
-        for( size_t j = 0; j < len; j++ )
+        size_t j = 0;
+
+#if CV_SSE2
+        if (USE_SSE2)
+        {
+            for ( ; j < len; j += 4)
+            {
+                __m128i v_src = _mm_loadu_si128((__m128i const *)(tptr + j));
+                __m128i v_cmp_mask = _mm_cmplt_epi32(v_mask2, _mm_and_si128(v_src, v_mask1));
+                __m128i v_res = _mm_or_si128(_mm_andnot_si128(v_cmp_mask, v_src), _mm_and_si128(v_cmp_mask, v_val));
+                _mm_storeu_si128((__m128i *)(tptr + j), v_res);
+            }
+        }
+#endif
+
+        for( ; j < len; j++ )
            if( (tptr[j] & 0x7fffffff) > 0x7f800000 )
                tptr[j] = val.i;
    }
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -2758,21 +2758,30 @@ namespace cv {

 static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
 {
-    int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
-            sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn),
+    int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), kercn = cn;
+    if (cn == 1)
+    {
+        kercn = std::min(ocl::predictOptimalVectorWidth(_m), 4);
+        if (kercn != 4)
+            kercn = 1;
+    }
+    int sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn),
            rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;

    ocl::Kernel k("setIdentity", ocl::core::set_identity_oclsrc,
-                  format("-D T=%s -D T1=%s -D cn=%d -D ST=%s", ocl::memopTypeToStr(type),
-                         ocl::memopTypeToStr(depth), cn, ocl::memopTypeToStr(sctype)));
+                  format("-D T=%s -D T1=%s -D cn=%d -D ST=%s -D kercn=%d -D rowsPerWI=%d",
+                         ocl::memopTypeToStr(CV_MAKE_TYPE(depth, kercn)),
+                         ocl::memopTypeToStr(depth), cn,
+                         ocl::memopTypeToStr(sctype),
+                         kercn, rowsPerWI));
    if (k.empty())
        return false;

    UMat m = _m.getUMat();
-    k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, sctype, s)),
-           rowsPerWI);
+    k.args(ocl::KernelArg::WriteOnly(m, cn, kercn),
+           ocl::KernelArg::Constant(Mat(1, 1, sctype, s)));

-    size_t globalsize[2] = { m.cols, (m.rows + rowsPerWI - 1) / rowsPerWI };
+    size_t globalsize[2] = { m.cols * cn / kercn, (m.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
 }

@@ -3327,7 +3336,7 @@ static inline void reduceSumC_8u16u16s32f_64f(const cv::Mat& srcmat, cv::Mat& ds
            stype == CV_32FC3 ? (ippiSumHint)ippiSum_32f_C3R :
            stype == CV_32FC4 ? (ippiSumHint)ippiSum_32f_C4R : 0;
        func =
-        sdepth == CV_8U ? (cv::ReduceFunc)cv::reduceC_<uchar, double,   cv::OpAdd<double> > :
+            sdepth == CV_8U ? (cv::ReduceFunc)cv::reduceC_<uchar, double,   cv::OpAdd<double> > :
            sdepth == CV_16U ? (cv::ReduceFunc)cv::reduceC_<ushort, double,   cv::OpAdd<double> > :
            sdepth == CV_16S ? (cv::ReduceFunc)cv::reduceC_<short, double,   cv::OpAdd<double> > :
            sdepth == CV_32F ? (cv::ReduceFunc)cv::reduceC_<float, double,   cv::OpAdd<double> > : 0;
@@ -3441,12 +3450,18 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst,
    const int min_opt_cols = 128, buf_cols = 32;
    int sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
            ddepth = CV_MAT_DEPTH(dtype), ddepth0 = ddepth;
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
-            useOptimized = 1 == dim && _src.cols() > min_opt_cols;
+    const ocl::Device &defDev = ocl::Device::getDefault();
+    bool doubleSupport = defDev.doubleFPConfig() > 0;
+
+    size_t wgs = defDev.maxWorkGroupSize();
+    bool useOptimized = 1 == dim && _src.cols() > min_opt_cols && (wgs >= buf_cols);

    if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
        return false;

+    if ((op == CV_REDUCE_SUM && sdepth == CV_32F) || op == CV_REDUCE_MIN || op == CV_REDUCE_MAX)
+        return false;
+
    if (op == CV_REDUCE_AVG)
    {
        if (sdepth < CV_32S && ddepth < CV_32S)
@@ -3455,78 +3470,80 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst,

    const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG",
                                  "OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" };
-    char cvt[2][40];
-
    int wdepth = std::max(ddepth, CV_32F);
-    cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d"
-                                  " -D srcT=%s -D dstT=%s -D dstT0=%s -D convertToWT=%s"
-                                  " -D convertToDT=%s -D convertToDT0=%s%s",
-                                  ops[op], dim, cn, ddepth, ocl::typeToStr(useOptimized ? ddepth : sdepth),
-                                  ocl::typeToStr(ddepth), ocl::typeToStr(ddepth0),
-                                  ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
-                                  ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
-                                  ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[1]),
-                                  doubleSupport ? " -D DOUBLE_SUPPORT" : "");
-
    if (useOptimized)
    {
-        cv::String build_opt_pre = format("-D OP_REDUCE_PRE -D BUF_COLS=%d -D %s -D dim=1"
-                                          "  -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s",
-                                          buf_cols, ops[op], cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth),
-                                          ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
-                                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
-        ocl::Kernel kpre("reduce_horz_pre", ocl::core::reduce2_oclsrc, build_opt_pre);
-        if (kpre.empty())
+        size_t tileHeight = (size_t)(wgs / buf_cols);
+        if (defDev.isIntel())
+        {
+            static const size_t maxItemInGroupCount = 16;
+            tileHeight = min(tileHeight, defDev.localMemSize() / buf_cols / CV_ELEM_SIZE(CV_MAKETYPE(wdepth, cn)) / maxItemInGroupCount);
+        }
+        char cvt[3][40];
+        cv::String build_opt = format("-D OP_REDUCE_PRE -D BUF_COLS=%d -D TILE_HEIGHT=%d -D %s -D dim=1"
+                                            " -D cn=%d -D ddepth=%d"
+                                            " -D srcT=%s -D bufT=%s -D dstT=%s"
+                                            " -D convertToWT=%s -D convertToBufT=%s -D convertToDT=%s%s",
+                                            buf_cols, tileHeight, ops[op], cn, ddepth,
+                                            ocl::typeToStr(sdepth),
+                                            ocl::typeToStr(ddepth),
+                                            ocl::typeToStr(ddepth0),
+                                            ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
+                                            ocl::convertTypeStr(sdepth, ddepth, 1, cvt[1]),
+                                            ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[2]),
+                                            doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+        ocl::Kernel k("reduce_horz_opt", ocl::core::reduce2_oclsrc, build_opt);
+        if (k.empty())
            return false;
-
-        ocl::Kernel kmain("reduce", ocl::core::reduce2_oclsrc, build_opt);
-        if (kmain.empty())
-            return false;
-
        UMat src = _src.getUMat();
        Size dsize(1, src.rows);
        _dst.create(dsize, dtype);
        UMat dst = _dst.getUMat();

-        UMat buf(src.rows, buf_cols, dst.type());
-
-        kpre.args(ocl::KernelArg::ReadOnly(src),
-                  ocl::KernelArg::WriteOnlyNoSize(buf));
+        if (op0 == CV_REDUCE_AVG)
+            k.args(ocl::KernelArg::ReadOnly(src),
+                      ocl::KernelArg::WriteOnlyNoSize(dst), 1.0f / src.cols);
+        else
+            k.args(ocl::KernelArg::ReadOnly(src),
+                      ocl::KernelArg::WriteOnlyNoSize(dst));

+        size_t localSize[2] = { buf_cols, tileHeight};
        size_t globalSize[2] = { buf_cols, src.rows };
-        if (!kpre.run(2, globalSize, NULL, false))
+        return k.run(2, globalSize, localSize, false);
+    }
+    else
+    {
+        char cvt[2][40];
+        cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d"
+                                      " -D srcT=%s -D dstT=%s -D dstT0=%s -D convertToWT=%s"
+                                      " -D convertToDT=%s -D convertToDT0=%s%s",
+                                      ops[op], dim, cn, ddepth, ocl::typeToStr(useOptimized ? ddepth : sdepth),
+                                      ocl::typeToStr(ddepth), ocl::typeToStr(ddepth0),
+                                      ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
+                                      ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
+                                      ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[1]),
+                                      doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+
+        ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt);
+        if (k.empty())
            return false;

+        UMat src = _src.getUMat();
+        Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows);
+        _dst.create(dsize, dtype);
+        UMat dst = _dst.getUMat();
+
+        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src),
+                temparg = ocl::KernelArg::WriteOnlyNoSize(dst);
+
        if (op0 == CV_REDUCE_AVG)
-            kmain.args(ocl::KernelArg::ReadOnly(buf),
-                       ocl::KernelArg::WriteOnlyNoSize(dst), 1.0f / src.cols);
+            k.args(srcarg, temparg, 1.0f / (dim == 0 ? src.rows : src.cols));
        else
-            kmain.args(ocl::KernelArg::ReadOnly(buf),
-                       ocl::KernelArg::WriteOnlyNoSize(dst));
+            k.args(srcarg, temparg);

-        globalSize[0] = src.rows;
-        return kmain.run(1, globalSize, NULL, false);
+        size_t globalsize = std::max(dsize.width, dsize.height);
+        return k.run(1, &globalsize, NULL, false);
    }
-
-    ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt);
-    if (k.empty())
-        return false;
-
-    UMat src = _src.getUMat();
-    Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows);
-    _dst.create(dsize, dtype);
-    UMat dst = _dst.getUMat();
-
-    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src),
-            temparg = ocl::KernelArg::WriteOnlyNoSize(dst);
-
-    if (op0 == CV_REDUCE_AVG)
-        k.args(srcarg, temparg, 1.0f / (dim == 0 ? src.rows : src.cols));
-    else
-        k.args(srcarg, temparg);
-
-    size_t globalsize = std::max(dsize.width, dsize.height);
-    return k.run(1, &globalsize, NULL, false);
 }

 }
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -3494,9 +3494,8 @@ public:
    OpenCLBufferPoolImpl()
        : currentReservedSize(0), maxReservedSize(0)
    {
-        // Note: Buffer pool is disabled by default,
-        //       because we didn't receive significant performance improvement
-        maxReservedSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", 0);
+        int poolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
+        maxReservedSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", poolSize);
    }
    virtual ~OpenCLBufferPoolImpl()
    {
@@ -3739,6 +3738,7 @@ public:
                u->handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
                                           u->size, u->origdata, &retval);
                tempUMatFlags = UMatData::TEMP_COPIED_UMAT;
+
            }
            if(!u->handle || retval != CL_SUCCESS)
                return false;
@@ -3880,6 +3880,7 @@ public:
                if(u->data && retval == CL_SUCCESS)
                {
                    u->markHostCopyObsolete(false);
+                    u->markDeviceMemMapped(true);
                    return;
                }

@@ -3908,6 +3909,7 @@ public:
        if(!u)
            return;

+
        CV_Assert(u->handle != 0);

        UMatDataAutoLock autolock(u);
@@ -3918,8 +3920,10 @@ public:

        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
        cl_int retval = 0;
-        if( !u->copyOnMap() && u->data )
+        if( !u->copyOnMap() && u->deviceMemMapped() )
        {
+            CV_Assert(u->data != NULL);
+            u->markDeviceMemMapped(false);
            CV_Assert( (retval = clEnqueueUnmapMemObject(q,
                                (cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
            CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
--- a/modules/core/src/opencl/fft.cl
+++ b/modules/core/src/opencl/fft.cl
@@ -0,0 +1,864 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Itseez, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#define SQRT_2 0.707106781188f
+#define sin_120 0.866025403784f
+#define fft5_2  0.559016994374f
+#define fft5_3 -0.951056516295f
+#define fft5_4 -1.538841768587f
+#define fft5_5  0.363271264002f
+
+__attribute__((always_inline))
+float2 mul_float2(float2 a, float2 b) {
+    return (float2)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x));
+}
+
+__attribute__((always_inline))
+float2 twiddle(float2 a) {
+    return (float2)(a.y, -a.x);
+}
+
+__attribute__((always_inline))
+void butterfly2(float2 a0, float2 a1, __local float2* smem, __global const float2* twiddles,
+                const int x, const int block_size)
+{
+    const int k = x & (block_size - 1);
+    a1 = mul_float2(twiddles[k], a1);
+    const int dst_ind = (x << 1) - k;
+
+    smem[dst_ind] = a0 + a1;
+    smem[dst_ind+block_size] = a0 - a1;
+}
+
+__attribute__((always_inline))
+void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __global const float2* twiddles,
+                const int x, const int block_size)
+{
+    const int k = x & (block_size - 1);
+    a1 = mul_float2(twiddles[k], a1);
+    a2 = mul_float2(twiddles[k + block_size], a2);
+    a3 = mul_float2(twiddles[k + 2*block_size], a3);
+
+    const int dst_ind = ((x - k) << 2) + k;
+
+    float2 b0 = a0 + a2;
+    a2 = a0 - a2;
+    float2 b1 = a1 + a3;
+    a3 = twiddle(a1 - a3);
+
+    smem[dst_ind]                = b0 + b1;
+    smem[dst_ind + block_size]   = a2 + a3;
+    smem[dst_ind + 2*block_size] = b0 - b1;
+    smem[dst_ind + 3*block_size] = a2 - a3;
+}
+
+__attribute__((always_inline))
+void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global const float2* twiddles,
+                const int x, const int block_size)
+{
+    const int k = x % block_size;
+    a1 = mul_float2(twiddles[k], a1);
+    a2 = mul_float2(twiddles[k+block_size], a2);
+    const int dst_ind = ((x - k) * 3) + k;
+
+    float2 b1 = a1 + a2;
+    a2 = twiddle(sin_120*(a1 - a2));
+    float2 b0 = a0 - (float2)(0.5f)*b1;
+
+    smem[dst_ind] = a0 + b1;
+    smem[dst_ind + block_size] = b0 + a2;
+    smem[dst_ind + 2*block_size] = b0 - a2;
+}
+
+__attribute__((always_inline))
+void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local float2* smem, __global const float2* twiddles,
+                const int x, const int block_size)
+{
+    const int k = x % block_size;
+    a1 = mul_float2(twiddles[k], a1);
+    a2 = mul_float2(twiddles[k + block_size], a2);
+    a3 = mul_float2(twiddles[k+2*block_size], a3);
+    a4 = mul_float2(twiddles[k+3*block_size], a4);
+
+    const int dst_ind = ((x - k) * 5) + k;
+    __local float2* dst = smem + dst_ind;
+
+    float2 b0, b1, b5;
+
+    b1 = a1 + a4;
+    a1 -= a4;
+
+    a4 = a3 + a2;
+    a3 -= a2;
+
+    a2 = b1 + a4;
+    b0 = a0 - (float2)0.25f * a2;
+
+    b1 = fft5_2 * (b1 - a4);
+    a4 = fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x);
+    b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x);
+
+    a4.x += fft5_4 * a3.y;
+    a4.y -= fft5_4 * a3.x;
+
+    a1 = b0 + b1;
+    b0 -= b1;
+
+    dst[0] = a0 + a2;
+    dst[block_size] = a1 + a4;
+    dst[2 * block_size] = b0 + b5;
+    dst[3 * block_size] = b0 - b5;
+    dst[4 * block_size] = a1 - a4;
+}
+
+__attribute__((always_inline))
+void fft_radix2(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
+{
+    float2 a0, a1;
+
+    if (x < t)
+    {
+        a0 = smem[x];
+        a1 = smem[x+t];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x < t)
+        butterfly2(a0, a1, smem, twiddles, x, block_size);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((always_inline))
+void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
+{
+    const int x2 = x1 + t/2;
+    float2 a0, a1, a2, a3;
+
+    if (x1 < t/2)
+    {
+        a0 = smem[x1]; a1 = smem[x1+t];
+        a2 = smem[x2]; a3 = smem[x2+t];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x1 < t/2)
+    {
+        butterfly2(a0, a1, smem, twiddles, x1, block_size);
+        butterfly2(a2, a3, smem, twiddles, x2, block_size);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((always_inline))
+void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
+{
+    const int x2 = x1 + t/3;
+    const int x3 = x1 + 2*t/3;
+    float2 a0, a1, a2, a3, a4, a5;
+
+    if (x1 < t/3)
+    {
+        a0 = smem[x1]; a1 = smem[x1+t];
+        a2 = smem[x2]; a3 = smem[x2+t];
+        a4 = smem[x3]; a5 = smem[x3+t];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x1 < t/3)
+    {
+        butterfly2(a0, a1, smem, twiddles, x1, block_size);
+        butterfly2(a2, a3, smem, twiddles, x2, block_size);
+        butterfly2(a4, a5, smem, twiddles, x3, block_size);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((always_inline))
+void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
+{
+    const int thread_block = t/4;
+    const int x2 = x1 + thread_block;
+    const int x3 = x1 + 2*thread_block;
+    const int x4 = x1 + 3*thread_block;
+    float2 a0, a1, a2, a3, a4, a5, a6, a7;
+
+    if (x1 < t/4)
+    {
+        a0 = smem[x1]; a1 = smem[x1+t];
+        a2 = smem[x2]; a3 = smem[x2+t];
+        a4 = smem[x3]; a5 = smem[x3+t];
+        a6 = smem[x4]; a7 = smem[x4+t];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x1 < t/4)
+    {
+        butterfly2(a0, a1, smem, twiddles, x1, block_size);
+        butterfly2(a2, a3, smem, twiddles, x2, block_size);
+        butterfly2(a4, a5, smem, twiddles, x3, block_size);
+        butterfly2(a6, a7, smem, twiddles, x4, block_size);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((always_inline))
+void fft_radix2_B5(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
+{
+    const int thread_block = t/5;
+    const int x2 = x1 + thread_block;
+    const int x3 = x1 + 2*thread_block;
+    const int x4 = x1 + 3*thread_block;
+    const int x5 = x1 + 4*thread_block;
+    float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;
+
+    if (x1 < t/5)
+    {
+        a0 = smem[x1]; a1 = smem[x1+t];
+        a2 = smem[x2]; a3 = smem[x2+t];
+        a4 = smem[x3]; a5 = smem[x3+t];
+        a6 = smem[x4]; a7 = smem[x4+t];
+        a8 = smem[x5]; a9 = smem[x5+t];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x1 < t/5)
+    {
+        butterfly2(a0, a1, smem, twiddles, x1, block_size);
+        butterfly2(a2, a3, smem, twiddles, x2, block_size);
+        butterfly2(a4, a5, smem, twiddles, x3, block_size);
+        butterfly2(a6, a7, smem, twiddles, x4, block_size);
+        butterfly2(a8, a9, smem, twiddles, x5, block_size);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((always_inline))
+void fft_radix4(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
+{
+    float2 a0, a1, a2, a3;
+
+    if (x < t)
+    {
+        a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; a3 = smem[x+3*t];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x < t)
+        butterfly4(a0, a1, a2, a3, smem, twiddles, x, block_size);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((always_inline))
+void fft_radix4_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
+{
+    const int x2 = x1 + t/2;
+    float2 a0, a1, a2, a3, a4, a5, a6, a7;
+
+    if (x1 < t/2)
+    {
+        a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];
+        a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x1 < t/2)
+    {
+        butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);
+        butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((always_inline))
+void fft_radix4_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
+{
+    const int x2 = x1 + t/3;
+    const int x3 = x2 + t/3;
+    float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;
+
+    if (x1 < t/3)
+    {
+        a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];
+        a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];
+        a8 = smem[x3]; a9 = smem[x3+t]; a10 = smem[x3+2*t]; a11 = smem[x3+3*t];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x1 < t/3)
+    {
+        butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);
+        butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);
+        butterfly4(a8, a9, a10, a11, smem, twiddles, x3, block_size);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((always_inline))
+void fft_radix8(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
+{
+    const int k = x % block_size;
+    float2 a0, a1, a2, a3, a4, a5, a6, a7;
+
+    if (x < t)
+    {
+        int tw_ind = block_size / 8;
+
+        a0 = smem[x];
+        a1 = mul_float2(twiddles[k], smem[x + t]);
+        a2 = mul_float2(twiddles[k + block_size],smem[x+2*t]);
+        a3 = mul_float2(twiddles[k+2*block_size],smem[x+3*t]);
+        a4 = mul_float2(twiddles[k+3*block_size],smem[x+4*t]);
+        a5 = mul_float2(twiddles[k+4*block_size],smem[x+5*t]);
+        a6 = mul_float2(twiddles[k+5*block_size],smem[x+6*t]);
+        a7 = mul_float2(twiddles[k+6*block_size],smem[x+7*t]);
+
+        float2 b0, b1, b6, b7;
+
+        b0 = a0 + a4;
+        a4 = a0 - a4;
+        b1 = a1 + a5;
+        a5 = a1 - a5;
+        a5 = (float2)(SQRT_2) * (float2)(a5.x + a5.y, -a5.x + a5.y);
+        b6 = twiddle(a2 - a6);
+        a2 = a2 + a6;
+        b7 = a3 - a7;
+        b7 = (float2)(SQRT_2) * (float2)(-b7.x + b7.y, -b7.x - b7.y);
+        a3 = a3 + a7;
+
+        a0 = b0 + a2;
+        a2 = b0 - a2;
+        a1 = b1 + a3;
+        a3 = twiddle(b1 - a3);
+        a6 = a4 - b6;
+        a4 = a4 + b6;
+        a7 = twiddle(a5 - b7);
+        a5 = a5 + b7;
+
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x < t)
+    {
+        const int dst_ind = ((x - k) << 3) + k;
+        __local float2* dst = smem + dst_ind;
+
+        dst[0] = a0 + a1;
+        dst[block_size] = a4 + a5;
+        dst[2 * block_size] = a2 + a3;
+        dst[3 * block_size] = a6 + a7;
+        dst[4 * block_size] = a0 - a1;
+        dst[5 * block_size] = a4 - a5;
+        dst[6 * block_size] = a2 - a3;
+        dst[7 * block_size] = a6 - a7;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((always_inline))
+void fft_radix3(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
+{
+    float2 a0, a1, a2;
+
+    if (x < t)
+    {
+        a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x < t)
+        butterfly3(a0, a1, a2, smem, twiddles, x, block_size);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((always_inline))
+void fft_radix3_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
+{
+    const int x2 = x1 + t/2;
+    float2 a0, a1, a2, a3, a4, a5;
+
+    if (x1 < t/2)
+    {
+        a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
+        a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x1 < t/2)
+    {
+        butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
+        butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((always_inline))
+void fft_radix3_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
+{
+    const int x2 = x1 + t/3;
+    const int x3 = x2 + t/3;
+    float2 a0, a1, a2, a3, a4, a5, a6, a7, a8;
+
+    if (x1 < t/2)
+    {
+        a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
+        a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
+        a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x1 < t/2)
+    {
+        butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
+        butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
+        butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((always_inline))
+void fft_radix3_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
+{
+    const int thread_block = t/4;
+    const int x2 = x1 + thread_block;
+    const int x3 = x1 + 2*thread_block;
+    const int x4 = x1 + 3*thread_block;
+    float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;
+
+    if (x1 < t/4)
+    {
+        a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
+        a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
+        a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];
+        a9 = smem[x4]; a10 = smem[x4+t]; a11 = smem[x4+2*t];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x1 < t/4)
+    {
+        butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
+        butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
+        butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);
+        butterfly3(a9, a10, a11, smem, twiddles, x4, block_size);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((always_inline))
+void fft_radix5(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
+{
+    const int k = x % block_size;
+    float2 a0, a1, a2, a3, a4;
+
+    if (x < t)
+    {
+        a0 = smem[x]; a1 = smem[x + t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; a4 = smem[x+4*t];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x < t)
+        butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x, block_size);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((always_inline))
+void fft_radix5_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
+{
+    const int x2 = x1+t/2;
+    float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;
+
+    if (x1 < t/2)
+    {
+        a0 = smem[x1]; a1 = smem[x1 + t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; a4 = smem[x1+4*t];
+        a5 = smem[x2]; a6 = smem[x2 + t]; a7 = smem[x2+2*t]; a8 = smem[x2+3*t]; a9 = smem[x2+4*t];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x1 < t/2)
+    {
+        butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x1, block_size);
+        butterfly5(a5, a6, a7, a8, a9, smem, twiddles, x2, block_size);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+#ifdef DFT_SCALE
+#define SCALE_VAL(x, scale) x*scale
+#else
+#define SCALE_VAL(x, scale) x
+#endif
+
+__kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
+                                   __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                                   __global float2* twiddles_ptr, const int t, const int nz)
+{
+    const int x = get_global_id(0);
+    const int y = get_group_id(1);
+    const int block_size = LOCAL_SIZE/kercn;
+    if (y < nz)
+    {
+        __local float2 smem[LOCAL_SIZE];
+        __global const float2* twiddles = (__global float2*) twiddles_ptr;
+        const int ind = x;
+#ifdef IS_1D
+        float scale = 1.f/dst_cols;
+#else
+        float scale = 1.f/(dst_cols*dst_rows);
+#endif
+
+#ifdef COMPLEX_INPUT
+        __global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset)));
+        #pragma unroll
+        for (int i=0; i<kercn; i++)
+            smem[x+i*block_size] = src[i*block_size];
+#else
+        __global const float* src = (__global const float*)(src_ptr + mad24(y, src_step, mad24(x, (int)sizeof(float), src_offset)));
+        #pragma unroll
+        for (int i=0; i<kercn; i++)
+            smem[x+i*block_size] = (float2)(src[i*block_size], 0.f);
+#endif
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        RADIX_PROCESS;
+
+#ifdef COMPLEX_OUTPUT
+#ifdef NO_CONJUGATE
+        // copy result without complex conjugate
+        const int cols = dst_cols/2 + 1;
+#else
+        const int cols = dst_cols;
+#endif
+
+        __global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
+        #pragma unroll
+        for (int i=x; i<cols; i+=block_size)
+            dst[i] = SCALE_VAL(smem[i], scale);
+#else
+        // pack row to CCS
+        __local float* smem_1cn = (__local float*) smem;
+        __global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
+        for (int i=x; i<dst_cols-1; i+=block_size)
+            dst[i+1] = SCALE_VAL(smem_1cn[i+2], scale);
+        if (x == 0)
+            dst[0] = SCALE_VAL(smem_1cn[0], scale);
+#endif
+    }
+    else
+    {
+        // fill with zero other rows
+#ifdef COMPLEX_OUTPUT
+        __global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
+#else
+        __global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
+#endif
+        #pragma unroll
+        for (int i=x; i<dst_cols; i+=block_size)
+            dst[i] = 0.f;
+    }
+}
+
+__kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
+                                   __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                                   __global float2* twiddles_ptr, const int t, const int nz)
+{
+    const int x = get_group_id(0);
+    const int y = get_global_id(1);
+
+    if (x < nz)
+    {
+        __local float2 smem[LOCAL_SIZE];
+        __global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset));
+        __global const float2* twiddles = (__global float2*) twiddles_ptr;
+        const int ind = y;
+        const int block_size = LOCAL_SIZE/kercn;
+        float scale = 1.f/(dst_rows*dst_cols);
+
+        #pragma unroll
+        for (int i=0; i<kercn; i++)
+            smem[y+i*block_size] = *((__global const float2*)(src + i*block_size*src_step));
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        RADIX_PROCESS;
+
+#ifdef COMPLEX_OUTPUT
+        __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset));
+        #pragma unroll
+        for (int i=0; i<kercn; i++)
+            *((__global float2*)(dst + i*block_size*dst_step)) = SCALE_VAL(smem[y + i*block_size], scale);
+#else
+        if (x == 0)
+        {
+            // pack first column to CCS
+            __local float* smem_1cn = (__local float*) smem;
+            __global uchar* dst = dst_ptr + mad24(y+1, dst_step, dst_offset);
+            for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
+                *((__global float*) dst) = SCALE_VAL(smem_1cn[i+2], scale);
+            if (y == 0)
+                *((__global float*) (dst_ptr + dst_offset)) = SCALE_VAL(smem_1cn[0], scale);
+        }
+        else if (x == (dst_cols+1)/2)
+        {
+            // pack last column to CCS (if needed)
+            __local float* smem_1cn = (__local float*) smem;
+            __global uchar* dst = dst_ptr + mad24(dst_cols-1, (int)sizeof(float), mad24(y+1, dst_step, dst_offset));
+            for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
+                *((__global float*) dst) = SCALE_VAL(smem_1cn[i+2], scale);
+            if (y == 0)
+                *((__global float*) (dst_ptr + mad24(dst_cols-1, (int)sizeof(float), dst_offset))) = SCALE_VAL(smem_1cn[0], scale);
+        }
+        else
+        {
+            __global uchar* dst = dst_ptr + mad24(x, (int)sizeof(float)*2, mad24(y, dst_step, dst_offset - (int)sizeof(float)));
+            #pragma unroll
+            for (int i=y; i<dst_rows; i+=block_size, dst+=block_size*dst_step)
+                vstore2(SCALE_VAL(smem[i], scale), 0, (__global float*) dst);
+        }
+#endif
+    }
+}
+
+__kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
+                                    __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                                    __global float2* twiddles_ptr, const int t, const int nz)
+{
+    const int x = get_global_id(0);
+    const int y = get_group_id(1);
+    const int block_size = LOCAL_SIZE/kercn;
+#ifdef IS_1D
+    const float scale = 1.f/dst_cols;
+#else
+    const float scale = 1.f/(dst_cols*dst_rows);
+#endif
+
+    if (y < nz)
+    {
+        __local float2 smem[LOCAL_SIZE];
+        __global const float2* twiddles = (__global float2*) twiddles_ptr;
+        const int ind = x;
+
+#if defined(COMPLEX_INPUT) && !defined(NO_CONJUGATE)
+        __global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset)));
+        #pragma unroll
+        for (int i=0; i<kercn; i++)
+        {
+            smem[x+i*block_size].x =  src[i*block_size].x;
+            smem[x+i*block_size].y = -src[i*block_size].y;
+        }
+#else
+
+    #if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
+        __global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(2, (int)sizeof(float), src_offset)));
+
+        #pragma unroll
+        for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
+        {
+            smem[i+1].x = src[i].x;
+            smem[i+1].y = -src[i].y;
+            smem[LOCAL_SIZE-i-1] = src[i];
+        }
+    #else
+
+        #pragma unroll
+        for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
+        {
+            float2 src = vload2(0, (__global const float*)(src_ptr + mad24(y, src_step, mad24(2*i+1, (int)sizeof(float), src_offset))));
+
+            smem[i+1].x = src.x;
+            smem[i+1].y = -src.y;
+            smem[LOCAL_SIZE-i-1] = src;
+        }
+
+    #endif
+
+        if (x==0)
+        {
+            smem[0].x = *(__global const float*)(src_ptr + mad24(y, src_step, src_offset));
+            smem[0].y = 0.f;
+
+            if(LOCAL_SIZE % 2 ==0)
+            {
+                #if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
+                smem[LOCAL_SIZE/2].x = src[LOCAL_SIZE/2-1].x;
+                #else
+                smem[LOCAL_SIZE/2].x = *(__global const float*)(src_ptr + mad24(y, src_step, mad24(LOCAL_SIZE-1, (int)sizeof(float), src_offset)));
+                #endif
+                smem[LOCAL_SIZE/2].y = 0.f;
+            }
+        }
+#endif
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        RADIX_PROCESS;
+
+        // copy data to dst
+#ifdef COMPLEX_OUTPUT
+        __global float2* dst = (__global float*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset)));
+        #pragma unroll
+        for (int i=0; i<kercn; i++)
+        {
+            dst[i*block_size].x = SCALE_VAL(smem[x + i*block_size].x, scale);
+            dst[i*block_size].y = SCALE_VAL(-smem[x + i*block_size].y, scale);
+        }
+#else
+        __global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)), dst_offset)));
+        #pragma unroll
+        for (int i=0; i<kercn; i++)
+        {
+            dst[i*block_size] = SCALE_VAL(smem[x + i*block_size].x, scale);
+        }
+#endif
+    }
+    else
+    {
+        // fill with zero other rows
+#ifdef COMPLEX_OUTPUT
+        __global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
+#else
+        __global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
+#endif
+        #pragma unroll
+        for (int i=x; i<dst_cols; i+=block_size)
+            dst[i] = 0.f;
+    }
+}
+
+__kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
+                              __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                              __global float2* twiddles_ptr, const int t, const int nz)
+{
+    const int x = get_group_id(0);
+    const int y = get_global_id(1);
+
+#ifdef COMPLEX_INPUT
+    if (x < nz)
+    {
+        __local float2 smem[LOCAL_SIZE];
+        __global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset));
+        __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset));
+        __global const float2* twiddles = (__global float2*) twiddles_ptr;
+        const int ind = y;
+        const int block_size = LOCAL_SIZE/kercn;
+
+        #pragma unroll
+        for (int i=0; i<kercn; i++)
+        {
+            float2 temp = *((__global const float2*)(src + i*block_size*src_step));
+            smem[y+i*block_size].x =  temp.x;
+            smem[y+i*block_size].y =  -temp.y;
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        RADIX_PROCESS;
+
+        // copy data to dst
+        #pragma unroll
+        for (int i=0; i<kercn; i++)
+        {
+           __global float2* res = (__global float2*)(dst + i*block_size*dst_step);
+            res[0].x = smem[y + i*block_size].x;
+            res[0].y = -smem[y + i*block_size].y;
+        }
+    }
+#else
+    if (x < nz)
+    {
+        __global const float2* twiddles = (__global float2*) twiddles_ptr;
+        const int ind = y;
+        const int block_size = LOCAL_SIZE/kercn;
+
+        __local float2 smem[LOCAL_SIZE];
+#ifdef EVEN
+        if (x!=0 && (x!=(nz-1)))
+#else
+        if (x!=0)
+#endif
+        {
+            __global const uchar* src = src_ptr + mad24(y, src_step, mad24(2*x-1, (int)sizeof(float), src_offset));
+            #pragma unroll
+            for (int i=0; i<kercn; i++)
+            {
+                float2 temp = vload2(0, (__global const float*)(src + i*block_size*src_step));
+                smem[y+i*block_size].x = temp.x;
+                smem[y+i*block_size].y = -temp.y;
+            }
+        }
+        else
+        {
+            int ind = x==0 ? 0: 2*x-1;
+            __global const float* src = (__global const float*)(src_ptr + mad24(1, src_step, mad24(ind, (int)sizeof(float), src_offset)));
+            int step = src_step/(int)sizeof(float);
+
+            #pragma unroll
+            for (int i=y; i<(LOCAL_SIZE-1)/2; i+=block_size)
+            {
+                smem[i+1].x = src[2*i*step];
+                smem[i+1].y = -src[(2*i+1)*step];
+
+                smem[LOCAL_SIZE-i-1].x = src[2*i*step];;
+                smem[LOCAL_SIZE-i-1].y = src[(2*i+1)*step];
+            }
+            if (y==0)
+            {
+                smem[0].x = *(__global const float*)(src_ptr + mad24(ind, (int)sizeof(float), src_offset));
+                smem[0].y = 0.f;
+
+                if(LOCAL_SIZE % 2 ==0)
+                {
+                    smem[LOCAL_SIZE/2].x = src[(LOCAL_SIZE-2)*step];
+                    smem[LOCAL_SIZE/2].y = 0.f;
+                }
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        RADIX_PROCESS;
+
+        // copy data to dst
+        __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float2)), dst_offset));
+
+        #pragma unroll
+        for (int i=0; i<kercn; i++)
+        {
+            __global float2* res = (__global float2*)(dst + i*block_size*dst_step);
+            res[0].x =  smem[y + i*block_size].x;
+            res[0].y = -smem[y + i*block_size].y;
+        }
+    }
+#endif
+}
--- a/modules/core/src/opencl/lut.cl
+++ b/modules/core/src/opencl/lut.cl
@@ -36,114 +36,118 @@

 #if lcn == 1
    #if dcn == 4
-        #define LUT_OP(num)\
-            int idx = *(__global const int *)(srcptr + mad24(num, src_step, src_index));\
-            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
-            dst[0] = lut_l[idx & 0xff];\
-            dst[1] = lut_l[(idx >> 8) & 0xff];\
-            dst[2] = lut_l[(idx >> 16) & 0xff];\
+        #define LUT_OP  \
+            int idx = *(__global const int *)(srcptr + src_index); \
+            dst = (__global dstT *)(dstptr + dst_index); \
+            dst[0] = lut_l[idx & 0xff]; \
+            dst[1] = lut_l[(idx >> 8) & 0xff]; \
+            dst[2] = lut_l[(idx >> 16) & 0xff]; \
            dst[3] = lut_l[(idx >> 24) & 0xff];
    #elif dcn == 3
-        #define LUT_OP(num)\
-            uchar3 idx = vload3(0, srcptr + mad24(num, src_step, src_index));\
-            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
-            dst[0] = lut_l[idx.x];\
-            dst[1] = lut_l[idx.y];\
+        #define LUT_OP  \
+            uchar3 idx = vload3(0, srcptr + src_index); \
+            dst = (__global dstT *)(dstptr + dst_index); \
+            dst[0] = lut_l[idx.x]; \
+            dst[1] = lut_l[idx.y]; \
            dst[2] = lut_l[idx.z];
    #elif dcn == 2
-        #define LUT_OP(num)\
-            short idx = *(__global const short *)(srcptr + mad24(num, src_step, src_index));\
-            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
-            dst[0] = lut_l[idx & 0xff];\
+        #define LUT_OP \
+            short idx = *(__global const short *)(srcptr + src_index); \
+            dst = (__global dstT *)(dstptr + dst_index); \
+            dst[0] = lut_l[idx & 0xff]; \
            dst[1] = lut_l[(idx >> 8) & 0xff];
    #elif dcn == 1
-        #define LUT_OP(num)\
-            uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\
-            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
+        #define LUT_OP \
+            uchar idx = (srcptr + src_index)[0]; \
+            dst = (__global dstT *)(dstptr + dst_index); \
            dst[0] = lut_l[idx];
    #else
-        #define LUT_OP(num)\
-            __global const srcT * src = (__global const srcT *)(srcptr + mad24(num, src_step, src_index));\
-            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
-            for (int cn = 0; cn < dcn; ++cn)\
+        #define LUT_OP \
+            __global const srcT * src = (__global const srcT *)(srcptr + src_index); \
+            dst = (__global dstT *)(dstptr + dst_index); \
+            for (int cn = 0; cn < dcn; ++cn) \
                dst[cn] = lut_l[src[cn]];
    #endif
 #else
    #if dcn == 4
-        #define LUT_OP(num)\
-            __global const uchar4 *src_pixel = (__global const uchar4 *)(srcptr + mad24(num, src_step, src_index));\
-            int4 idx = convert_int4(src_pixel[0]) * lcn + (int4)(0, 1, 2, 3);\
-            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
-            dst[0] = lut_l[idx.x];\
-            dst[1] = lut_l[idx.y];\
-            dst[2] = lut_l[idx.z];\
+        #define LUT_OP \
+            __global const uchar4 * src_pixel = (__global const uchar4 *)(srcptr + src_index); \
+            int4 idx = mad24(convert_int4(src_pixel[0]), (int4)(lcn), (int4)(0, 1, 2, 3)); \
+            dst = (__global dstT *)(dstptr + dst_index); \
+            dst[0] = lut_l[idx.x]; \
+            dst[1] = lut_l[idx.y]; \
+            dst[2] = lut_l[idx.z]; \
            dst[3] = lut_l[idx.w];
    #elif dcn == 3
-        #define LUT_OP(num)\
-            uchar3 src_pixel = vload3(0, srcptr + mad24(num, src_step, src_index));\
-            int3 idx = convert_int3(src_pixel) * lcn + (int3)(0, 1, 2);\
-            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
-            dst[0] = lut_l[idx.x];\
-            dst[1] = lut_l[idx.y];\
+        #define LUT_OP \
+            uchar3 src_pixel = vload3(0, srcptr + src_index); \
+            int3 idx = mad24(convert_int3(src_pixel), (int3)(lcn), (int3)(0, 1, 2)); \
+            dst = (__global dstT *)(dstptr + dst_index); \
+            dst[0] = lut_l[idx.x]; \
+            dst[1] = lut_l[idx.y]; \
            dst[2] = lut_l[idx.z];
    #elif dcn == 2
-        #define LUT_OP(num)\
-            __global const uchar2 *src_pixel = (__global const uchar2 *)(srcptr + mad24(num, src_step, src_index));\
-            int2 idx = convert_int2(src_pixel[0]) * lcn + (int2)(0, 1);\
-            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
-            dst[0] = lut_l[idx.x];\
+        #define LUT_OP \
+            __global const uchar2 * src_pixel = (__global const uchar2 *)(srcptr + src_index); \
+            int2 idx = mad24(convert_int2(src_pixel[0]), lcn, (int2)(0, 1)); \
+            dst = (__global dstT *)(dstptr + dst_index); \
+            dst[0] = lut_l[idx.x]; \
            dst[1] = lut_l[idx.y];
    #elif dcn == 1 //error case (1 < lcn) ==> lcn == scn == dcn
-        #define LUT_OP(num)\
-            uchar idx = (srcptr + mad24(num, src_step, src_index))[0];\
-            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
+        #define LUT_OP \
+            uchar idx = (srcptr + src_index)[0]; \
+            dst = (__global dstT *)(dstptr + dst_index); \
            dst[0] = lut_l[idx];
    #else
-        #define LUT_OP(num)\
-            __global const srcT *src = (__global const srcT *)(srcptr + mad24(num, src_step, src_index));\
-            dst = (__global dstT *)(dstptr + mad24(num, dst_step, dst_index));\
-            for (int cn = 0; cn < dcn; ++cn)\
+        #define LUT_OP \
+            __global const srcT * src = (__global const srcT *)(srcptr + src_index); \
+            dst = (__global dstT *)(dstptr + dst_index); \
+            for (int cn = 0; cn < dcn; ++cn) \
                dst[cn] = lut_l[mad24(src[cn], lcn, cn)];
    #endif
 #endif

-#define LOCAL_LUT_INIT\
-    {\
-        __global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);\
-        int init = mad24((int)get_local_id(1), (int)get_local_size(0), (int)get_local_id(0));\
-        int step = get_local_size(0) * get_local_size(1);\
-        for (int i = init; i < 256 * lcn; i += step)\
-        {\
-            lut_l[i] = lut[i];\
-        }\
-        barrier(CLK_LOCAL_MEM_FENCE);\
-    }
-
 __kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset,
                  __global const uchar * lutptr, int lut_step, int lut_offset,
                  __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols)
 {
-    __local dstT lut_l[256 * lcn];
-    LOCAL_LUT_INIT;
-
    int x = get_global_id(0);
-    int y = 4 * get_global_id(1);
+    int y = get_global_id(1) << 2;
+
+    __local dstT lut_l[256 * lcn];
+    __global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);
+
+    for (int i = mad24((int)get_local_id(1), (int)get_local_size(0), (int)get_local_id(0)),
+             step = get_local_size(0) * get_local_size(1); i < 256 * lcn; i += step)
+        lut_l[i] = lut[i];
+    barrier(CLK_LOCAL_MEM_FENCE);

    if (x < cols && y < rows)
    {
        int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * dcn, src_offset));
        int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT) * dcn, dst_offset));
+
        __global dstT * dst;
-        LUT_OP(0);
+
+        LUT_OP;
+
        if (y < rows - 1)
        {
-            LUT_OP(1);
+            src_index += src_step;
+            dst_index += dst_step;
+            LUT_OP;
+
            if (y < rows - 2)
            {
-                LUT_OP(2);
+                src_index += src_step;
+                dst_index += dst_step;
+                LUT_OP;
+
                if (y < rows - 3)
                {
-                    LUT_OP(3);
+                    src_index += src_step;
+                    dst_index += dst_step;
+                    LUT_OP;
                }
            }
        }
--- a/modules/core/src/opencl/minmaxloc.cl
+++ b/modules/core/src/opencl/minmaxloc.cl
@@ -42,9 +42,13 @@
 #if wdepth <= 4
 #define MIN_ABS(a) convertFromU(abs(a))
 #define MIN_ABS2(a, b) convertFromU(abs_diff(a, b))
+#define MIN(a, b) min(a, b)
+#define MAX(a, b) max(a, b)
 #else
 #define MIN_ABS(a) fabs(a)
 #define MIN_ABS2(a, b) fabs(a - b)
+#define MIN(a, b) fmin(a, b)
+#define MAX(a, b) fmax(a, b)
 #endif

 #if kercn != 3
@@ -60,44 +64,41 @@
 #define srcTSIZE (int)sizeof(srcT1)
 #endif

-#ifdef NEED_MINLOC
-#define CALC_MINLOC(inc) minloc = id + inc
-#else
-#define CALC_MINLOC(inc)
-#endif
-
-#ifdef NEED_MAXLOC
-#define CALC_MAXLOC(inc) maxloc = id + inc
-#else
-#define CALC_MAXLOC(inc)
-#endif
-
 #ifdef NEED_MINVAL
+#ifdef NEED_MINLOC
 #define CALC_MIN(p, inc) \
    if (minval > temp.p) \
    { \
        minval = temp.p; \
-        CALC_MINLOC(inc); \
+        minloc = id + inc; \
    }
 #else
+#define CALC_MIN(p, inc) \
+    minval = MIN(minval, temp.p);
+#endif
+#else
 #define CALC_MIN(p, inc)
 #endif

 #ifdef NEED_MAXVAL
+#ifdef NEED_MAXLOC
 #define CALC_MAX(p, inc) \
    if (maxval < temp.p) \
    { \
        maxval = temp.p; \
-        CALC_MAXLOC(inc); \
+        maxloc = id + inc; \
    }
 #else
+#define CALC_MAX(p, inc) \
+    maxval = MAX(maxval, temp.p);
+#endif
+#else
 #define CALC_MAX(p, inc)
 #endif

 #ifdef OP_CALC2
 #define CALC_MAX2(p) \
-    if (maxval2 < temp.p) \
-        maxval2 = temp.p;
+    maxval2 = MAX(maxval2, temp.p);
 #else
 #define CALC_MAX2(p)
 #endif
@@ -208,25 +209,28 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off

 #if kercn == 1
 #ifdef NEED_MINVAL
+#ifdef NEED_MINLOC
            if (minval > temp)
            {
                minval = temp;
-#ifdef NEED_MINLOC
                minloc = id;
-#endif
            }
+#else
+            minval = MIN(minval, temp);
+#endif
 #endif
 #ifdef NEED_MAXVAL
+#ifdef NEED_MAXLOC
            if (maxval < temp)
            {
                maxval = temp;
-#ifdef NEED_MAXLOC
                maxloc = id;
-#endif
            }
+#else
+            maxval = MAX(maxval, temp);
+#endif
 #ifdef OP_CALC2
-            if (maxval2 < temp2)
-                maxval2 = temp2;
+            maxval2 = MAX(maxval2, temp2);
 #endif
 #endif
 #elif kercn >= 2
@@ -282,32 +286,35 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
    {
        int lid3 = lid - WGS2_ALIGNED;
 #ifdef NEED_MINVAL
+#ifdef NEED_MINLOC
        if (localmem_min[lid3] >= minval)
        {
-#ifdef NEED_MINLOC
            if (localmem_min[lid3] == minval)
                localmem_minloc[lid3] = min(localmem_minloc[lid3], minloc);
            else
                localmem_minloc[lid3] = minloc,
-#endif
-                localmem_min[lid3] = minval;
+            localmem_min[lid3] = minval;
        }
+#else
+        localmem_min[lid3] = MIN(localmem_min[lid3], minval);
+#endif
 #endif
 #ifdef NEED_MAXVAL
+#ifdef NEED_MAXLOC
        if (localmem_max[lid3] <= maxval)
        {
-#ifdef NEED_MAXLOC
            if (localmem_max[lid3] == maxval)
                localmem_maxloc[lid3] = min(localmem_maxloc[lid3], maxloc);
            else
                localmem_maxloc[lid3] = maxloc,
-#endif
-                localmem_max[lid3] = maxval;
+            localmem_max[lid3] = maxval;
        }
+#else
+        localmem_max[lid3] = MAX(localmem_max[lid3], maxval);
+#endif
 #endif
 #ifdef OP_CALC2
-        if (localmem_max2[lid3] < maxval2)
-            localmem_max2[lid3] = maxval2;
+        localmem_max2[lid3] = MAX(localmem_max2[lid3], maxval2);
 #endif
    }
    barrier(CLK_LOCAL_MEM_FENCE);
@@ -319,32 +326,35 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
            int lid2 = lsize + lid;

 #ifdef NEED_MINVAL
+#ifdef NEED_MINLOC
            if (localmem_min[lid] >= localmem_min[lid2])
            {
-#ifdef NEED_MINLOC
                if (localmem_min[lid] == localmem_min[lid2])
                    localmem_minloc[lid] = min(localmem_minloc[lid2], localmem_minloc[lid]);
                else
                    localmem_minloc[lid] = localmem_minloc[lid2],
-#endif
-                    localmem_min[lid] = localmem_min[lid2];
+                localmem_min[lid] = localmem_min[lid2];
            }
+#else
+            localmem_min[lid] = MIN(localmem_min[lid], localmem_min[lid2]);
+#endif
 #endif
 #ifdef NEED_MAXVAL
+#ifdef NEED_MAXLOC
            if (localmem_max[lid] <= localmem_max[lid2])
            {
-#ifdef NEED_MAXLOC
                if (localmem_max[lid] == localmem_max[lid2])
                    localmem_maxloc[lid] = min(localmem_maxloc[lid2], localmem_maxloc[lid]);
                else
                    localmem_maxloc[lid] = localmem_maxloc[lid2],
-#endif
-                    localmem_max[lid] = localmem_max[lid2];
+                localmem_max[lid] = localmem_max[lid2];
            }
+#else
+            localmem_max[lid] = MAX(localmem_max[lid], localmem_max[lid2]);
+#endif
 #endif
 #ifdef OP_CALC2
-            if (localmem_max2[lid] < localmem_max2[lid2])
-                localmem_max2[lid] = localmem_max2[lid2];
+            localmem_max2[lid] = MAX(localmem_max2[lid], localmem_max2[lid2]);
 #endif
        }
        barrier(CLK_LOCAL_MEM_FENCE);
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -379,7 +379,7 @@
 #define REDUCE_GLOBAL \
    dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
    dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp = SUM_ABS2(temp, temp2)); \
+    temp = SUM_ABS2(temp, temp2); \
    FUNC(accumulator, temp.s0); \
    FUNC(accumulator, temp.s1); \
    FUNC(accumulator, temp.s2); \
--- a/modules/core/src/opencl/reduce2.cl
+++ b/modules/core/src/opencl/reduce2.cl
@@ -81,29 +81,34 @@
 #define PROCESS_ELEM(acc, value) acc += value
 #elif defined OCL_CV_REDUCE_MAX
 #define INIT_VALUE MIN_VAL
-#define PROCESS_ELEM(acc, value) acc = value > acc ? value : acc
+#define PROCESS_ELEM(acc, value) acc = max(value, acc)
 #elif defined OCL_CV_REDUCE_MIN
 #define INIT_VALUE MAX_VAL
-#define PROCESS_ELEM(acc, value) acc = value < acc ? value : acc
+#define PROCESS_ELEM(acc, value) acc = min(value, acc)
 #else
 #error "No operation is specified"
 #endif

 #ifdef OP_REDUCE_PRE

-__kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
-                     __global uchar * bufptr, int buf_step, int buf_offset)
+__kernel void reduce_horz_opt(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
+                     __global uchar * dstptr, int dst_step, int dst_offset
+#ifdef OCL_CV_REDUCE_AVG
+                     , float fscale
+#endif
+                     )
 {
+    __local bufT lsmem[TILE_HEIGHT][BUF_COLS][cn];
+
    int x = get_global_id(0);
    int y = get_global_id(1);
-    if (x < BUF_COLS)
+    int liy = get_local_id(1);
+    if ((x < BUF_COLS) && (y < rows))
    {
        int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * cn, src_offset));
-        int buf_index = mad24(y, buf_step, mad24(x, (int)sizeof(dstT) * cn, buf_offset));

        __global const srcT * src = (__global const srcT *)(srcptr + src_index);
-        __global dstT * buf = (__global dstT *)(bufptr + buf_index);
-        dstT tmp[cn] = { INIT_VALUE };
+        bufT tmp[cn] = { INIT_VALUE };

        int src_step_mul = BUF_COLS * cn;
        for (int idx = x; idx < cols; idx += BUF_COLS, src += src_step_mul)
@@ -111,14 +116,49 @@ __kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int s
            #pragma unroll
            for (int c = 0; c < cn; ++c)
            {
-                dstT value = convertToDT(src[c]);
+                bufT value = convertToBufT(src[c]);
                PROCESS_ELEM(tmp[c], value);
            }
        }

        #pragma unroll
        for (int c = 0; c < cn; ++c)
-            buf[c] = tmp[c];
+            lsmem[liy][x][c] = tmp[c];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if ((x < BUF_COLS / 2) && (y < rows))
+    {
+        #pragma unroll
+        for (int c = 0; c < cn; ++c)
+        {
+            PROCESS_ELEM(lsmem[liy][x][c], lsmem[liy][x +  BUF_COLS / 2][c]);
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if ((x == 0) && (y < rows))
+    {
+        int dst_index = mad24(y, dst_step, dst_offset);
+
+        __global dstT * dst = (__global dstT *)(dstptr + dst_index);
+        bufT tmp[cn] = { INIT_VALUE };
+
+        #pragma unroll
+        for (int xin = 0; xin < BUF_COLS / 2; xin ++)
+        {
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+            {
+                PROCESS_ELEM(tmp[c], lsmem[liy][xin][c]);
+            }
+        }
+
+        #pragma unroll
+        for (int c = 0; c < cn; ++c)
+#ifdef OCL_CV_REDUCE_AVG
+            dst[c] = convertToDT(convertToWT(tmp[c]) * fscale);
+#else
+            dst[c] = convertToDT(tmp[c]);
+#endif
    }
 }

--- a/modules/core/src/opencl/set_identity.cl
+++ b/modules/core/src/opencl/set_identity.cl
@@ -43,20 +43,18 @@
 //
 //M*/

-#if cn != 3
-#define loadpix(addr) *(__global const T *)(addr)
+#if kercn != 3
 #define storepix(val, addr)  *(__global T *)(addr) = val
 #define TSIZE (int)sizeof(T)
 #define scalar scalar_
 #else
-#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
 #define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
 #define TSIZE ((int)sizeof(T1)*3)
 #define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
 #endif

 __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols,
-                          ST scalar_, int rowsPerWI)
+                          ST scalar_)
 {
    int x = get_global_id(0);
    int y0 = get_global_id(1) * rowsPerWI;
@@ -65,7 +63,35 @@ __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset,
    {
        int src_index = mad24(y0, src_step, mad24(x, TSIZE, src_offset));

-        for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step)
-            storepix(x == y ? scalar : (T)(0), srcptr + src_index);
+#if kercn == cn
+        #pragma unroll
+        for (int y = y0, i = 0, y1 = min(rows, y0 + rowsPerWI); i < rowsPerWI; ++y, ++i, src_index += src_step)
+            if (y < y1)
+                storepix(x == y ? scalar : (T)(0), srcptr + src_index);
+#elif kercn == 4 && cn == 1
+        if (y0 < rows)
+        {
+            storepix(x == y0 >> 2 ? (T)(scalar, 0, 0, 0) : (T)(0), srcptr + src_index);
+            if (++y0 < rows)
+            {
+                src_index += src_step;
+                storepix(x == y0 >> 2 ? (T)(0, scalar, 0, 0) : (T)(0), srcptr + src_index);
+
+                if (++y0 < rows)
+                {
+                    src_index += src_step;
+                    storepix(x == y0 >> 2 ? (T)(0, 0, scalar, 0) : (T)(0), srcptr + src_index);
+
+                    if (++y0 < rows)
+                    {
+                        src_index += src_step;
+                        storepix(x == y0 >> 2 ? (T)(0, 0, 0, scalar) : (T)(0), srcptr + src_index);
+                    }
+                }
+            }
+        }
+#else
+#error "Incorrect combination of cn && kercn"
+#endif
    }
 }
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -479,9 +479,10 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
        haveMask = _mask.kind() != _InputArray::NONE,
        haveSrc2 = _src2.kind() != _InputArray::NONE;
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
-            kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src) : 1,
+            kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src, _src2) : 1,
            mcn = std::max(cn, kercn);
    CV_Assert(!haveSrc2 || _src2.type() == type);
+    int convert_cn = haveSrc2 ? mcn : cn;

    if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
        return false;
@@ -513,7 +514,7 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
                         haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
                         haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "",
-                         depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, mcn, cvt[1]) : "noconvert");
+                         depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, convert_cn, cvt[1]) : "noconvert");

    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
    if (k.empty())
@@ -918,8 +919,14 @@ static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv
        int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
        bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
                isContinuous = _src.isContinuous();
-        int groups = ocl::Device::getDefault().maxComputeUnits();
-        size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
+        const ocl::Device &defDev = ocl::Device::getDefault();
+        int groups = defDev.maxComputeUnits();
+        if (defDev.isIntel())
+        {
+            static const int subSliceEUCount = 10;
+            groups = (groups / subSliceEUCount) * 2;
+        }
+        size_t wgs = defDev.maxWorkGroupSize();

        int ddepth = std::max(CV_32S, depth), sqddepth = std::max(CV_32F, depth),
                dtype = CV_MAKE_TYPE(ddepth, cn),
@@ -1445,6 +1452,9 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*

    CV_Assert(!haveSrc2 || _src2.type() == type);

+    if (depth == CV_32S || depth == CV_32F)
+        return false;
+
    if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport)
        return false;

@@ -2178,6 +2188,9 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double &
         (!doubleSupport && depth == CV_64F))
        return false;

+    if( depth == CV_32F && (!_mask.empty() || normType == NORM_INF) )
+        return false;
+
    UMat src = _src.getUMat();

    if (normType == NORM_INF)
@@ -2270,7 +2283,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )

                setIppErrorStatus();
            }
-            typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
+            /*typedef IppStatus (CV_STDCALL* ippiMaskNormFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
            ippiMaskNormFuncC3 ippFuncC3 =
                normType == NORM_INF ?
                (type == CV_8UC3 ? (ippiMaskNormFuncC3)ippiNorm_Inf_8u_C3CMR :
@@ -2305,7 +2318,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
                    return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
                }
                setIppErrorStatus();
-            }
+            }*/
        }
        else
        {
@@ -2533,7 +2546,7 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr
    normType &= ~NORM_RELATIVE;
    bool normsum = normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR;

-    if ( !(normType == NORM_INF || normsum) )
+    if ( !normsum || !_mask.empty() )
        return false;

    if (normsum)
@@ -2711,7 +2724,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
                0) :
                normType == NORM_L1 ?
                (type == CV_8UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8u_C1MR :
-                type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR :
+                //type == CV_8SC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_8s_C1MR :
                type == CV_16UC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_16u_C1MR :
                type == CV_32FC1 ? (ippiMaskNormDiffFuncC1)ippiNormDiff_L1_32f_C1MR :
                0) :
@@ -2728,7 +2741,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
                    return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
                setIppErrorStatus();
            }
-            typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
+            /*typedef IppStatus (CV_STDCALL* ippiMaskNormDiffFuncC3)(const void *, int, const void *, int, const void *, int, IppiSize, int, Ipp64f *);
            ippiMaskNormDiffFuncC3 ippFuncC3 =
                normType == NORM_INF ?
                (type == CV_8UC3 ? (ippiMaskNormDiffFuncC3)ippiNormDiff_Inf_8u_C3CMR :
@@ -2763,7 +2776,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
                    return normType == NORM_L2SQR ? (double)(norm * norm) : (double)norm;
                }
                setIppErrorStatus();
-            }
+            }*/
        }
        else
        {
--- a/modules/core/test/ocl/test_arithm.cpp
+++ b/modules/core/test/ocl/test_arithm.cpp
@@ -829,7 +829,7 @@ OCL_TEST_P(Pow, Mat)
 {
    static const double pows[] = { -4, -1, -2.5, 0, 1, 2, 3.7, 4 };

-    for (int j = 0; j < test_loop_times; j++)
+    for (int j = 0; j < 1/*test_loop_times*/; j++)
        for (int k = 0, size = sizeof(pows) / sizeof(double); k < size; ++k)
        {
            SCOPED_TRACE(pows[k]);
@@ -1203,7 +1203,7 @@ OCL_TEST_P(MinMaxIdx_Mask, Mat)

 static bool relativeError(double actual, double expected, double eps)
 {
-    return std::abs(actual - expected) / actual < eps;
+    return std::abs(actual - expected) < eps*(1 + std::abs(actual));
 }

 typedef ArithmTestBase Norm;
@@ -1230,7 +1230,7 @@ OCL_TEST_P(Norm, NORM_INF_1arg_mask)
        OCL_OFF(const double cpuRes = cv::norm(src1_roi, NORM_INF, mask_roi));
        OCL_ON(const double gpuRes = cv::norm(usrc1_roi, NORM_INF, umask_roi));

-        EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+        EXPECT_NEAR(cpuRes, gpuRes, 0.2);
    }
 }

@@ -1302,7 +1302,7 @@ OCL_TEST_P(Norm, NORM_INF_2args)
            OCL_OFF(const double cpuRes = cv::norm(src1_roi, src2_roi, type));
            OCL_ON(const double gpuRes = cv::norm(usrc1_roi, usrc2_roi, type));

-            EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+            EXPECT_NEAR(cpuRes, gpuRes, 0.2);
        }
 }

--- a/modules/core/test/ocl/test_dft.cpp
+++ b/modules/core/test/ocl/test_dft.cpp
@@ -48,17 +48,26 @@

 #ifdef HAVE_OPENCL

+enum OCL_FFT_TYPE
+{
+    R2R = 0,
+    C2R = 1,
+    R2C = 2,
+    C2C = 3
+};
+
 namespace cvtest {
 namespace ocl {

 ////////////////////////////////////////////////////////////////////////////
 // Dft

-PARAM_TEST_CASE(Dft, cv::Size, MatDepth, bool, bool, bool, bool)
+PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool, bool)
 {
    cv::Size dft_size;
-    int	dft_flags, depth;
-    bool inplace;
+    int	dft_flags, depth, cn, dft_type;
+    bool hint;
+    bool is1d;

    TEST_DECLARE_INPUT_PARAMETER(src);
    TEST_DECLARE_OUTPUT_PARAMETER(dst);
@@ -66,34 +75,50 @@ PARAM_TEST_CASE(Dft, cv::Size, MatDepth, bool, bool, bool, bool)
    virtual void SetUp()
    {
        dft_size = GET_PARAM(0);
-        depth = GET_PARAM(1);
-        inplace = GET_PARAM(2);
+        dft_type = GET_PARAM(1);
+        depth = CV_32F;

        dft_flags = 0;
+        switch (dft_type)
+        {
+        case R2R: dft_flags |= cv::DFT_REAL_OUTPUT; cn = 1; break;
+        case C2R: dft_flags |= cv::DFT_REAL_OUTPUT; cn = 2; break;
+        case R2C: dft_flags |= cv::DFT_COMPLEX_OUTPUT; cn = 1; break;
+        case C2C: dft_flags |= cv::DFT_COMPLEX_OUTPUT; cn = 2; break;
+        }
+
+        if (GET_PARAM(2))
+            dft_flags |= cv::DFT_INVERSE;
        if (GET_PARAM(3))
            dft_flags |= cv::DFT_ROWS;
        if (GET_PARAM(4))
            dft_flags |= cv::DFT_SCALE;
-        if (GET_PARAM(5))
-            dft_flags |= cv::DFT_INVERSE;
+        hint = GET_PARAM(5);
+        is1d = (dft_flags & DFT_ROWS) != 0 || dft_size.height == 1;
    }

-    void generateTestData(int cn = 2)
+    void generateTestData()
    {
        src = randomMat(dft_size, CV_MAKE_TYPE(depth, cn), 0.0, 100.0);
        usrc = src.getUMat(ACCESS_READ);
-
-        if (inplace)
-            dst = src, udst = usrc;
    }
 };

-OCL_TEST_P(Dft, C2C)
+OCL_TEST_P(Dft, Mat)
 {
    generateTestData();

-    OCL_OFF(cv::dft(src, dst, dft_flags | cv::DFT_COMPLEX_OUTPUT));
-    OCL_ON(cv::dft(usrc, udst, dft_flags | cv::DFT_COMPLEX_OUTPUT));
+    int nonzero_rows = hint ? src.cols - randomInt(1, src.rows-1) : 0;
+    OCL_OFF(cv::dft(src, dst, dft_flags, nonzero_rows));
+    OCL_ON(cv::dft(usrc, udst, dft_flags, nonzero_rows));
+
+    // In case forward R2C 1d tranform dst contains only half of output
+    // without complex conjugate
+    if (dft_type == R2C && is1d && (dft_flags & cv::DFT_INVERSE) == 0)
+    {
+        dst = dst(cv::Range(0, dst.rows), cv::Range(0, dst.cols/2 + 1));
+        udst = udst(cv::Range(0, udst.rows), cv::Range(0, udst.cols/2 + 1));
+    }

    double eps = src.size().area() * 1e-4;
    EXPECT_MAT_NEAR(dst, udst, eps);
@@ -150,15 +175,15 @@ OCL_TEST_P(MulSpectrums, Mat)

 OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool()));

-OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(2, 3), cv::Size(5, 4), cv::Size(25, 20),
-                                                       cv::Size(512, 1), cv::Size(1024, 768)),
-                                               Values(CV_32F, CV_64F),
-                                               Bool(), // inplace
+OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(10, 10), cv::Size(36, 36), cv::Size(512, 1), cv::Size(1280, 768)),
+                                               Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R),
+                                               Bool(), // DFT_INVERSE
                                               Bool(), // DFT_ROWS
                                               Bool(), // DFT_SCALE
-                                               Bool()) // DFT_INVERSE
+                                               Bool()  // hint
+                                               )
                            );

 } } // namespace cvtest::ocl

-#endif // HAVE_OPENCL
+#endif // HAVE_OPENCL