Merge pull request #705 from bitwangyaoyao:2.4_oclFix

2013-03-26 18:38:36 +04:00 · 2013-03-26 18:38:36 +04:00 · 6c99b5c9e5
commit 6c99b5c9e5
parent 6a6ae355b1 d6f1ad8c14
14 changed files with 965 additions and 1547 deletions
--- a/cmake/OpenCVDetectOpenCL.cmake
+++ b/cmake/OpenCVDetectOpenCL.cmake
@ -43,7 +43,7 @@ if(OPENCL_FOUND)
  set(OPENCL_LIBRARIES    ${OPENCL_LIBRARY})

  if (X86_64)
-    set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import)
+    set(CLAMD_POSSIBLE_LIB_SUFFIXES lib64/import)
  elseif (X86)
    set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import)
  endif()
--- a/cmake/cl2cpp.cmake
+++ b/cmake/cl2cpp.cmake
@ -18,6 +18,7 @@ foreach(cl ${cl_list})
  string(REPLACE "\t" "  " lines "${lines}")

  string(REGEX REPLACE "/\\*([^*]/|\\*[^/]|[^*/])*\\*/" ""   lines "${lines}") # multiline comments
+  string(REGEX REPLACE "/\\*([^\n])*\\*/"               ""   lines "${lines}") # single-line comments
  string(REGEX REPLACE "[ ]*//[^\n]*\n"                 "\n" lines "${lines}") # single-line comments
  string(REGEX REPLACE "\n[ ]*(\n[ ]*)*"                "\n" lines "${lines}") # empty lines & leading whitespace
  string(REGEX REPLACE "^\n"                            ""   lines "${lines}") # leading new line
--- a/modules/ocl/src/brute_force_matcher.cpp
+++ b/modules/ocl/src/brute_force_matcher.cpp
@ -44,6 +44,7 @@
 //M*/

 #include "precomp.hpp"
+
 #include <functional>
 #include <iterator>
 #include <vector>
@ -60,10 +61,11 @@ namespace cv
    }
 }

-template < int BLOCK_SIZE, int MAX_DESC_LEN,  typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
 void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
                         const oclMat &trainIdx, const oclMat &distance, int distType)
 {
+    assert(query.type() == CV_32F);
    cv::ocl::Context *ctx = query.clCxt;
    size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
@ -91,20 +93,21 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat

        std::string kernelName = "BruteForceMatch_UnrollMatch";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

-template < int BLOCK_SIZE, int MAX_DESC_LEN,  typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
 void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/,
                         const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/)
 {
 }

-template < int BLOCK_SIZE,  typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE/*, typename Mask*/ >
 void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
           const oclMat &trainIdx, const oclMat &distance, int distType)
 {
+    assert(query.type() == CV_32F);
    cv::ocl::Context *ctx = query.clCxt;
    size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
@ -130,21 +133,22 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,

        std::string kernelName = "BruteForceMatch_Match";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

-template < int BLOCK_SIZE,  typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE/*, typename Mask*/ >
 void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/,
           const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/)
 {
 }

 //radius_matchUnrolledCached
-template < int BLOCK_SIZE, int MAX_DESC_LEN,  typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
 void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
                         const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
 {
+    assert(query.type() == CV_32F);
    cv::ocl::Context *ctx = query.clCxt;
    size_t globalSize[] = {(train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
@ -176,15 +180,16 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist

        std::string kernelName = "BruteForceMatch_RadiusUnrollMatch";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

 //radius_match
-template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE/*, typename Mask*/ >
 void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
                  const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
 {
+    assert(query.type() == CV_32F);
    cv::ocl::Context *ctx = query.clCxt;
    size_t globalSize[] = {(train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
@ -214,263 +219,70 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c

        std::string kernelName = "BruteForceMatch_RadiusMatch";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
-        //float *dis = (float *)clEnqueueMapBuffer(ctx->impl->clCmdQueue, (cl_mem)distance.data, CL_TRUE, CL_MAP_READ, 0, 8, 0, NULL, NULL, NULL);
-        //printf("%f, %f\n", dis[0], dis[1]);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

-// with mask
-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
+static void matchDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
                     const oclMat &trainIdx, const oclMat &distance, int distType)
 {
+    const oclMat zeroMask;
+    const oclMat &tempMask = mask.data ? mask : zeroMask;
    if (query.cols <= 64)
    {
-        matchUnrolledCached<16, 64, T>(query, train, mask, trainIdx, distance, distType);
+        matchUnrolledCached<16, 64>(query, train, tempMask, trainIdx, distance, distType);
    }
    else if (query.cols <= 128)
    {
-        matchUnrolledCached<16, 128, T>(query, train, mask, trainIdx,  distance, distType);
+        matchUnrolledCached<16, 128>(query, train, tempMask, trainIdx,  distance, distType);
    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);
-    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);
-    }*/
    else
    {
-        match<16, T>(query, train, mask, trainIdx, distance, distType);
+        match<16>(query, train, tempMask, trainIdx, distance, distType);
    }
 }

-// without mask
-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat &train, const oclMat &trainIdx, const oclMat &distance, int distType)
-{
-    oclMat mask;
-    if (query.cols <= 64)
-    {
-        matchUnrolledCached<16, 64, T>(query, train, mask, trainIdx, distance, distType);
-    }
-    else if (query.cols <= 128)
-    {
-        matchUnrolledCached<16, 128, T>(query, train, mask, trainIdx,  distance, distType);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance);
-    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance);
-    }*/
-    else
-    {
-        match<16, T>(query, train, mask, trainIdx, distance, distType);
-    }
-}
-
-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat *trains, int n, const oclMat &mask,
+static void matchDispatcher(const oclMat &query, const oclMat *trains, int n, const oclMat &mask,
                     const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, int distType)
 {
+    const oclMat zeroMask;
+    const oclMat &tempMask = mask.data ? mask : zeroMask;
    if (query.cols <= 64)
    {
-        matchUnrolledCached<16, 64, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType);
+        matchUnrolledCached<16, 64>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
    }
    else if (query.cols <= 128)
    {
-        matchUnrolledCached<16, 128, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType);
+        matchUnrolledCached<16, 128>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-    }*/
    else
    {
-        match<16, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType);
-    }
-}
-
-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat *trains, int n, const oclMat &trainIdx,
-                     const oclMat &imgIdx, const oclMat &distance, int distType)
-{
-    oclMat mask;
-    if (query.cols <= 64)
-    {
-        matchUnrolledCached<16, 64, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType);
-    }
-    else if (query.cols <= 128)
-    {
-        matchUnrolledCached<16, 128, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-    }*/
-    else
-    {
-        match<16, T>(query, trains, n, mask, trainIdx, imgIdx, distance, distType);
+        match<16>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
    }
 }

 //radius matchDispatcher
-// with mask
-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
+static void matchDispatcher(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
                     const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
 {
+    const oclMat zeroMask;
+    const oclMat &tempMask = mask.data ? mask : zeroMask;
    if (query.cols <= 64)
    {
-        matchUnrolledCached<16, 64, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
+        matchUnrolledCached<16, 64>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
    }
    else if (query.cols <= 128)
    {
-        matchUnrolledCached<16, 128, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
+        matchUnrolledCached<16, 128>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }*/
    else
    {
-        radius_match<16, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-}
-
-// without mask
-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &trainIdx,
-                     const oclMat &distance, const oclMat &nMatches, int distType)
-{
-    oclMat mask;
-    if (query.cols <= 64)
-    {
-        matchUnrolledCached<16, 64, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-    else if (query.cols <= 128)
-    {
-        matchUnrolledCached<16, 128, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }*/
-    else
-    {
-        radius_match<16, T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-}
-
-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxDistance, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
-{
-    if (query.cols <= 64)
-    {
-        matchUnrolledCached<16, 64, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-    else if (query.cols <= 128)
-    {
-        matchUnrolledCached<16, 128, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }*/
-    else
-    {
-        match<16, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-}
-
-// without mask
-template < typename T/*, typename Mask*/ >
-void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxDistance, const oclMat &trainIdx,
-                     const oclMat &distance, const oclMat &nMatches, int distType)
-{
-    oclMat mask;
-    if (query.cols <= 64)
-    {
-        matchUnrolledCached<16, 64, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-    else if (query.cols <= 128)
-    {
-        matchUnrolledCached<16, 128, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }*/
-    else
-    {
-        match<16, T>(query, train, n, maxDistance, mask, trainIdx, distance, nMatches, distType);
+        radius_match<16>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
    }
 }

 //knn match Dispatcher
-template < int BLOCK_SIZE, int MAX_DESC_LEN,  typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
 void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
                             const oclMat &trainIdx, const oclMat &distance, int distType)
 {
@ -501,11 +313,11 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl

        std::string kernelName = "BruteForceMatch_knnUnrollMatch";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

-template < int BLOCK_SIZE,  typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE/*, typename Mask*/ >
 void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
               const oclMat &trainIdx, const oclMat &distance, int distType)
 {
@ -534,11 +346,11 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,

        std::string kernelName = "BruteForceMatch_knnMatch";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

-template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
 void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@ -567,11 +379,11 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat

        std::string kernelName = "BruteForceMatch_calcDistanceUnrolled";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

-template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
+template < int BLOCK_SIZE/*, typename Mask*/ >
 void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@ -598,69 +410,43 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask

        std::string kernelName = "BruteForceMatch_calcDistance";

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
    }
 }

 ///////////////////////////////////////////////////////////////////////////////
 // Calc Distance dispatcher
-template < typename T/*, typename Mask*/ >
-void calcDistanceDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
+static void calcDistanceDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
                            const oclMat &allDist, int distType)
 {
    if (query.cols <= 64)
    {
-        calcDistanceUnrolled<16, 64, T>(query, train, mask, allDist, distType);
+        calcDistanceUnrolled<16, 64>(query, train, mask, allDist, distType);
    }
    else if (query.cols <= 128)
    {
-        calcDistanceUnrolled<16, 128, T>(query, train, mask, allDist, distType);
+        calcDistanceUnrolled<16, 128>(query, train, mask, allDist, distType);
    }
-    /*else if (query.cols <= 256)
-    {
-        calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
-    }
-    else if (query.cols <= 512)
-    {
-        calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
-    }*/
    else
    {
-        calcDistance<16, T>(query, train, mask, allDist, distType);
+        calcDistance<16>(query, train, mask, allDist, distType);
    }
 }

-template < typename T/*, typename Mask*/ >
-void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
+static void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
                      const oclMat &trainIdx, const oclMat &distance, int distType)
 {
    if (query.cols <= 64)
    {
-        knn_matchUnrolledCached<16, 64, T>(query, train, mask, trainIdx, distance, distType);
+        knn_matchUnrolledCached<16, 64>(query, train, mask, trainIdx, distance, distType);
    }
    else if (query.cols <= 128)
    {
-        knn_matchUnrolledCached<16, 128, T>(query, train, mask, trainIdx, distance, distType);
+        knn_matchUnrolledCached<16, 128>(query, train, mask, trainIdx, distance, distType);
    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
-    }
-    else if (query.cols <= 512)
-    {
-        matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
-    }
-    else if (query.cols <= 1024)
-    {
-        matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
-    }*/
    else
    {
-        knn_match<16, T>(query, train, mask, trainIdx, distance, distType);
+        knn_match<16>(query, train, mask, trainIdx, distance, distType);
    }
 }

@ -686,7 +472,7 @@ void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const o
        //args.push_back( make_pair( sizeof(cl_int), (void *)&train.cols ));
        //args.push_back( make_pair( sizeof(cl_int), (void *)&query.step ));

-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
+        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, trainIdx.depth(), -1);
    }
 }

@ -695,206 +481,22 @@ static void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat &
    findKnnMatch<256>(k, trainIdx, distance, allDist, distType);
 }

-//with mask
-template < typename T/*, typename Mask*/ >
-void kmatchDispatcher(const oclMat &query, const oclMat &train, int k, const oclMat &mask,
+static void kmatchDispatcher(const oclMat &query, const oclMat &train, int k, const oclMat &mask,
                      const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType)
 {
+    const oclMat zeroMask;
+    const oclMat &tempMask = mask.data ? mask : zeroMask;
    if (k == 2)
    {
-        match2Dispatcher<T>(query, train, mask, trainIdx, distance, distType);
+        match2Dispatcher(query, train, tempMask, trainIdx, distance, distType);
    }
    else
    {
-        calcDistanceDispatcher<T>(query, train, mask, allDist, distType);
+        calcDistanceDispatcher(query, train, tempMask, allDist, distType);
        findKnnMatchDispatcher(k, trainIdx, distance, allDist, distType);
    }
 }

-//without mask
-template < typename T/*, typename Mask*/ >
-void kmatchDispatcher(const oclMat &query, const oclMat &train, int k,
-                      const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType)
-{
-    oclMat mask;
-    if (k == 2)
-    {
-        match2Dispatcher<T>(query, train, mask, trainIdx, distance, distType);
-    }
-    else
-    {
-        calcDistanceDispatcher<T>(query, train, mask, allDist, distType);
-        findKnnMatchDispatcher(k, trainIdx, distance, allDist, distType);
-    }
-}
-
-
-
-template <typename T>
-void ocl_matchL1_gpu(const oclMat &query, const oclMat &train, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance)
-{
-    int distType = 0;
-    if (mask.data)
-    {
-        matchDispatcher<T>(query, train, mask, trainIdx, distance, distType);
-    }
-    else
-    {
-        matchDispatcher< T >(query, train, trainIdx, distance, distType);
-    }
-}
-
-template <typename T>
-void ocl_matchL1_gpu(const oclMat &query, const oclMat &trains, const oclMat &masks,
-                     const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance)
-{
-    int distType = 0;
-
-    if (masks.data)
-    {
-        matchDispatcher<T>(query, (const oclMat *)trains.ptr(), trains.cols, masks, trainIdx, imgIdx, distance, distType);
-    }
-    else
-    {
-        matchDispatcher<T>(query, (const oclMat *)trains.ptr(), trains.cols, trainIdx, imgIdx, distance, distType);
-    }
-}
-
-template <typename T>
-void ocl_matchL2_gpu(const oclMat &query, const oclMat &train, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance)
-{
-    int distType = 1;
-    if (mask.data)
-    {
-        matchDispatcher<T>(query, train, mask, trainIdx, distance, distType);
-    }
-    else
-    {
-        matchDispatcher<T >(query, train, trainIdx, distance, distType);
-    }
-}
-
-template <typename T>
-void ocl_matchL2_gpu(const oclMat &query, const oclMat &trains, const oclMat &masks,
-                     const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance)
-{
-    int distType = 1;
-    if (masks.data)
-    {
-        matchDispatcher<T>(query, (const oclMat *)trains.ptr(), trains.cols, masks, trainIdx, imgIdx, distance, distType);
-    }
-    else
-    {
-        matchDispatcher<T>(query, (const oclMat *)trains.ptr(), trains.cols, trainIdx, imgIdx, distance, distType);
-    }
-}
-
-template <typename T>
-void ocl_matchHamming_gpu(const oclMat &query, const oclMat &train, const oclMat &mask,
-                          const oclMat &trainIdx, const oclMat &distance)
-{
-    int distType = 2;
-    if (mask.data)
-    {
-        matchDispatcher<T>(query, train, mask, trainIdx, distance, distType);
-    }
-    else
-    {
-        matchDispatcher< T >(query, train, trainIdx, distance, distType);
-    }
-}
-
-template <typename T>
-void ocl_matchHamming_gpu(const oclMat &query, const oclMat &trains, const oclMat &masks,
-                          const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance)
-{
-    int distType = 2;
-    if (masks.data)
-    {
-        matchDispatcher<T>(query, (const oclMat *)trains.ptr(), trains.cols, masks, trainIdx, imgIdx, distance, distType);
-    }
-    else
-    {
-        matchDispatcher<T>(query, (const oclMat *)trains.ptr(), trains.cols, trainIdx, imgIdx, distance, distType);
-    }
-}
-
-// knn caller
-template <typename T>
-void ocl_matchL1_gpu(const oclMat &query, const oclMat &train, int k, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist)
-{
-    int distType = 0;
-
-    if (mask.data)
-        kmatchDispatcher<T>(query, train, k, mask, trainIdx, distance, allDist, distType);
-    else
-        kmatchDispatcher<T>(query, train, k, trainIdx, distance, allDist, distType);
-}
-
-template <typename T>
-void ocl_matchL2_gpu(const oclMat &query, const oclMat &train, int k, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist)
-{
-    int distType = 1;
-
-    if (mask.data)
-        kmatchDispatcher<T>(query, train, k, mask, trainIdx, distance, allDist, distType);
-    else
-        kmatchDispatcher<T>(query, train, k, trainIdx, distance, allDist, distType);
-}
-
-template <typename T>
-void ocl_matchHamming_gpu(const oclMat &query, const oclMat &train, int k, const oclMat &mask,
-                          const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist)
-{
-    int distType = 2;
-
-    if (mask.data)
-        kmatchDispatcher<T>(query, train, k, mask, trainIdx, distance, allDist, distType);
-    else
-        kmatchDispatcher<T>(query, train, k,  trainIdx, distance, allDist, distType);
-}
-
-//radius caller
-template <typename T>
-void ocl_matchL1_gpu(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches)
-{
-    int distType = 0;
-
-    if (mask.data)
-        matchDispatcher<T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    else
-        matchDispatcher<T>(query, train, maxDistance, trainIdx, distance, nMatches, distType);
-}
-
-template <typename T>
-void ocl_matchL2_gpu(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches)
-{
-    int distType = 1;
-
-    if (mask.data)
-        matchDispatcher<T>(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
-    else
-        matchDispatcher<T>(query, train, maxDistance, trainIdx, distance, nMatches, distType);
-}
-
-template <typename T>
-void ocl_matchHamming_gpu(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
-                          const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches)
-{
-    int distType = 2;
-
-    if (mask.data)
-        matchDispatcher<T>(query, train, maxDistance, mask, trainIdx, distance,  nMatches, distType);
-    else
-        matchDispatcher<T>(query, train, maxDistance, trainIdx, distance, nMatches, distType);
-}
-
 cv::ocl::BruteForceMatcher_OCL_base::BruteForceMatcher_OCL_base(DistType distType_) : distType(distType_)
 {
 }
@ -929,38 +531,28 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchSingle(const oclMat &query, const
 {
    if (query.empty() || train.empty())
        return;
+    
+    // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
+    int callType = query.depth();
+    char cvFuncName[] = "singleMatch";
+    if (callType != 5)
+        CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n");

-    typedef void (*caller_t)(const oclMat & query, const oclMat & train, const oclMat & mask,
-                             const oclMat & trainIdx, const oclMat & distance);
-
-    static const caller_t callers[3][6] =
+    if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0
+        || callType != 2 || callType != 4)))
    {
-        {
-            ocl_matchL1_gpu<unsigned char>, 0/*ocl_matchL1_gpu<signed char>*/,
-            ocl_matchL1_gpu<unsigned short>, ocl_matchL1_gpu<short>,
-            ocl_matchL1_gpu<int>, ocl_matchL1_gpu<float>
-        },
-        {
-            0/*ocl_matchL2_gpu<unsigned char>*/, 0/*ocl_matchL2_gpu<signed char>*/,
-            0/*ocl_matchL2_gpu<unsigned short>*/, 0/*ocl_matchL2_gpu<short>*/,
-            0/*ocl_matchL2_gpu<int>*/, ocl_matchL2_gpu<float>
-        },
-        {
-            ocl_matchHamming_gpu<unsigned char>, 0/*ocl_matchHamming_gpu<signed char>*/,
-            ocl_matchHamming_gpu<unsigned short>, 0/*ocl_matchHamming_gpu<short>*/,
-            ocl_matchHamming_gpu<int>, 0/*ocl_matchHamming_gpu<float>*/
-        }
-    };
+        CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n");
+    }

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.cols == query.cols && train.type() == query.type());

-    const int nQuery = query.rows;
-    trainIdx.create(1, nQuery, CV_32S);
-    distance.create(1, nQuery, CV_32F);
+    trainIdx.create(1, query.rows, CV_32S);
+    distance.create(1, query.rows, CV_32F);

-    caller_t func = callers[distType][query.depth()];
-    func(query, train, mask, trainIdx, distance);
+    matchDispatcher(query, train, mask, trainIdx, distance, distType);
+exit:
+    return;
 }

 void cv::ocl::BruteForceMatcher_OCL_base::matchDownload(const oclMat &trainIdx, const oclMat &distance, vector<DMatch> &matches)
@ -1062,40 +654,27 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchCollection(const oclMat &query, c
    if (query.empty() || trainCollection.empty())
        return;

-    typedef void (*caller_t)(const oclMat & query, const oclMat & trains, const oclMat & masks,
-                             const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance);
+    // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
+    int callType = query.depth();
+    char cvFuncName[] = "matchCollection";
+    if (callType != 5)
+        CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n");

-    static const caller_t callers[3][6] =
+    if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0
+        || callType != 2 || callType != 4)))
    {
-        {
-            ocl_matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
-            ocl_matchL1_gpu<unsigned short>, ocl_matchL1_gpu<short>,
-            ocl_matchL1_gpu<int>, ocl_matchL1_gpu<float>
-        },
-        {
-            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
-            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
-            0/*matchL2_gpu<int>*/, ocl_matchL2_gpu<float>
-        },
-        {
-            ocl_matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
-            ocl_matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
-            ocl_matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
-        }
-    };
+        CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n");
+    }

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);

-    const int nQuery = query.rows;
+    trainIdx.create(1, query.rows, CV_32S);
+    imgIdx.create(1, query.rows, CV_32S);
+    distance.create(1, query.rows, CV_32F);

-    trainIdx.create(1, nQuery, CV_32S);
-    imgIdx.create(1, nQuery, CV_32S);
-    distance.create(1, nQuery, CV_32F);
-
-    caller_t func = callers[distType][query.depth()];
-    CV_Assert(func != 0);
-
-    func(query, trainCollection, masks, trainIdx, imgIdx, distance);
+    matchDispatcher(query, (const oclMat *)trainCollection.ptr(), trainCollection.cols, masks, trainIdx, imgIdx, distance, distType);
+exit:
+    return;
 }

 void cv::ocl::BruteForceMatcher_OCL_base::matchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, vector<DMatch> &matches)
@ -1164,52 +743,39 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatchSingle(const oclMat &query, co
    if (query.empty() || train.empty())
        return;

-    typedef void (*caller_t)(const oclMat & query, const oclMat & train, int k, const oclMat & mask,
-                             const oclMat & trainIdx, const oclMat & distance, const oclMat & allDist);
+    // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
+    int callType = query.depth();

-    static const caller_t callers[3][6] =
+    char cvFuncName[] = "knnMatchSingle";
+    if (callType != 5)
+        CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n");
+
+    if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0
+        || callType != 2 || callType != 4)))
    {
-        {
-            ocl_matchL1_gpu<unsigned char>, 0/*ocl_matchL1_gpu<signed char>*/,
-            ocl_matchL1_gpu<unsigned short>, ocl_matchL1_gpu<short>,
-            ocl_matchL1_gpu<int>, ocl_matchL1_gpu<float>
-        },
-        {
-            0/*ocl_matchL2_gpu<unsigned char>*/, 0/*ocl_matchL2_gpu<signed char>*/,
-            0/*ocl_matchL2_gpu<unsigned short>*/, 0/*ocl_matchL2_gpu<short>*/,
-            0/*ocl_matchL2_gpu<int>*/, ocl_matchL2_gpu<float>
-        },
-        {
-            ocl_matchHamming_gpu<unsigned char>, 0/*ocl_matchHamming_gpu<signed char>*/,
-            ocl_matchHamming_gpu<unsigned short>, 0/*ocl_matchHamming_gpu<short>*/,
-            ocl_matchHamming_gpu<int>, 0/*ocl_matchHamming_gpu<float>*/
-        }
-    };
+        CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n");
+    }

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.type() == query.type() && train.cols == query.cols);

-    const int nQuery = query.rows;
-    const int nTrain = train.rows;
-
    if (k == 2)
    {
-        trainIdx.create(1, nQuery, CV_32SC2);
-        distance.create(1, nQuery, CV_32FC2);
+        trainIdx.create(1, query.rows, CV_32SC2);
+        distance.create(1, query.rows, CV_32FC2);
    }
    else
    {
-        trainIdx.create(nQuery, k, CV_32S);
-        distance.create(nQuery, k, CV_32F);
-        allDist.create(nQuery, nTrain, CV_32FC1);
+        trainIdx.create(query.rows, k, CV_32S);
+        distance.create(query.rows, k, CV_32F);
+        allDist.create(query.rows, train.rows, CV_32FC1);
    }

    trainIdx.setTo(Scalar::all(-1));

-    caller_t func = callers[distType][query.depth()];
-    CV_Assert(func != 0);
-
-    func(query, train, k, mask, trainIdx, distance, allDist);
+    kmatchDispatcher(query, train, k, mask, trainIdx, distance, allDist, distType);
+exit:
+    return;
 }

 void cv::ocl::BruteForceMatcher_OCL_base::knnMatchDownload(const oclMat &trainIdx, const oclMat &distance, vector< vector<DMatch> > &matches, bool compactResult)
@ -1394,8 +960,6 @@ namespace
 void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, vector< vector<DMatch> > &matches, int k,
        const vector<oclMat> &masks, bool compactResult)
 {
-
-
    if (k == 2)
    {
        oclMat trainCollection;
@ -1455,50 +1019,34 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query,
    if (query.empty() || train.empty())
        return;

-    typedef void (*caller_t)(const oclMat & query, const oclMat & train, float maxDistance, const oclMat & mask,
-                             const oclMat & trainIdx, const oclMat & distance, const oclMat & nMatches);
+    // match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
+    int callType = query.depth();
+    char cvFuncName[] = "radiusMatchSingle";
+    if (callType != 5)
+        CV_ERROR(CV_UNSUPPORTED_FORMAT_ERR, "BruteForceMatch OpenCL only support float type query!\n");

-    //#if 0
-    static const caller_t callers[3][6] =
+    if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0
+        || callType != 2 || callType != 4)))
    {
-        {
-            ocl_matchL1_gpu<unsigned char>, 0/*ocl_matchL1_gpu<signed char>*/,
-            ocl_matchL1_gpu<unsigned short>, ocl_matchL1_gpu<short>,
-            ocl_matchL1_gpu<int>, ocl_matchL1_gpu<float>
-        },
-        {
-            0/*ocl_matchL2_gpu<unsigned char>*/, 0/*ocl_matchL2_gpu<signed char>*/,
-            0/*ocl_matchL2_gpu<unsigned short>*/, 0/*ocl_matchL2_gpu<short>*/,
-            0/*ocl_matchL2_gpu<int>*/, ocl_matchL2_gpu<float>
-        },
-        {
-            ocl_matchHamming_gpu<unsigned char>, 0/*ocl_matchHamming_gpu<signed char>*/,
-            ocl_matchHamming_gpu<unsigned short>, 0/*ocl_matchHamming_gpu<short>*/,
-            ocl_matchHamming_gpu<int>, 0/*ocl_matchHamming_gpu<float>*/
-        }
-    };
-    //#endif
-
-    const int nQuery = query.rows;
-    const int nTrain = train.rows;
+        CV_ERROR(CV_UNSUPPORTED_DEPTH_ERR, "BruteForceMatch OpenCL only support float type query!\n");
+    }

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.type() == query.type() && train.cols == query.cols);
-    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size()));
+    CV_Assert(trainIdx.empty() || (trainIdx.rows == query.rows && trainIdx.size() == distance.size()));

-    nMatches.create(1, nQuery, CV_32SC1);
+    nMatches.create(1, query.rows, CV_32SC1);
    if (trainIdx.empty())
    {
-        trainIdx.create(nQuery, std::max((nTrain / 100), 10), CV_32SC1);
-        distance.create(nQuery, std::max((nTrain / 100), 10), CV_32FC1);
+        trainIdx.create(query.rows, std::max((train.rows/ 100), 10), CV_32SC1);
+        distance.create(query.rows, std::max((train.rows/ 100), 10), CV_32FC1);
    }

    nMatches.setTo(Scalar::all(0));

-    caller_t func = callers[distType][query.depth()];
-    //CV_Assert(func != 0);
-    //func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
-    func(query, train, maxDistance, mask, trainIdx, distance, nMatches);
+    matchDispatcher(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
+exit:
+    return;
 }

 void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchDownload(const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches,
@ -1697,5 +1245,3 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, vecto
    radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks);
    radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult);
 }
-
-
--- a/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
@ -953,8 +953,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
        //int flag = 0;

        oclMat gimg1(gimg.rows, gimg.cols, CV_8UC1);
-        oclMat gsum(totalheight, gimg.cols + 1, CV_32SC1);
-        oclMat gsqsum(totalheight, gimg.cols + 1, CV_32FC1);
+        oclMat gsum(totalheight + 4, gimg.cols + 1, CV_32SC1);
+        oclMat gsqsum(totalheight + 4, gimg.cols + 1, CV_32FC1);

        //cl_mem cascadebuffer;
        cl_mem stagebuffer;
--- a/modules/ocl/src/match_template.cpp
+++ b/modules/ocl/src/match_template.cpp
@ -71,6 +71,9 @@ namespace cv
        void matchTemplate_SQDIFF_NORMED(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

+        void convolve_32F(
+            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
+
        void matchTemplate_CCORR(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

@ -90,41 +93,65 @@ namespace cv
        void matchTemplateNaive_CCORR(
            const oclMat &image, const oclMat &templ, oclMat &result, int cn);

+        void extractFirstChannel_32F(
+            const oclMat &image, oclMat &result);
+
        // Evaluates optimal template's area threshold. If
        // template's area is less  than the threshold, we use naive match
        // template version, otherwise FFT-based (if available)
-        static int getTemplateThreshold(int method, int depth)
+        static bool useNaive(int , int , Size )
        {
-            switch (method)
-            {
-            case CV_TM_CCORR:
-                if (depth == CV_32F) return 250;
-                if (depth == CV_8U) return 300;
-                break;
-            case CV_TM_SQDIFF:
-                if (depth == CV_32F) return 0x7fffffff; // do naive SQDIFF for CV_32F
-                if (depth == CV_8U) return 300;
-                break;
-            }
-            CV_Error(CV_StsBadArg, "getTemplateThreshold: unsupported match template mode");
-            return 0;
+            // FIXME!
+            //   always use naive until convolve is imported
+            return true; 
        }

        //////////////////////////////////////////////////////////////////////
        // SQDIFF
        void matchTemplate_SQDIFF(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &)
+            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf & buf)
        {
            result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
-            if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
+            if (useNaive(CV_TM_SQDIFF, image.depth(), templ.size()))
            {
                matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels());
                return;
            }
            else
            {
-                // TODO
-                CV_Error(CV_StsBadArg, "Not supported yet for this size template");
+                buf.image_sqsums.resize(1);
+                
+                // TODO, add double support for ocl::integral
+                // use CPU integral temporarily
+                Mat sums, sqsums;
+                cv::integral(Mat(image.reshape(1)), sums, sqsums);
+                buf.image_sqsums[0] = sqsums;
+
+                unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
+                matchTemplate_CCORR(image, templ, result, buf);
+
+                //port CUDA's matchTemplatePrepared_SQDIFF_8U
+                Context *clCxt = image.clCxt;
+                string kernelName = "matchTemplate_Prepared_SQDIFF";
+                vector< pair<size_t, const void *> > args;
+
+                args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
+                args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
+                args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+                args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+                size_t globalThreads[3] = {result.cols, result.rows, 1};
+                size_t localThreads[3]  = {16, 16, 1};
+
+                const char * build_opt = image.oclchannels() == 4 ? "-D CN4" : "";
+                openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U, build_opt);
            }
        }

@ -134,7 +161,6 @@ namespace cv
            matchTemplate_CCORR(image, templ, result, buf);
            buf.image_sums.resize(1);

-
            integral(image.reshape(1), buf.image_sums[0]);

            unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
@ -156,7 +182,7 @@ namespace cv
            args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));

            size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {32, 8, 1};
+            size_t localThreads[3]  = {16, 16, 1};
            openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
        }

@ -191,33 +217,39 @@ namespace cv
            args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));

            size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {32, 8, 1};
+            size_t localThreads[3]  = {16, 16, 1};
            openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
        }

        //////////////////////////////////////////////////////////////////////
        // CCORR
+        void convolve_32F(
+            const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &)
+        {
+            CV_Error(-1, "convolve is not fully implemented yet");
+        }
+
        void matchTemplate_CCORR(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
        {
            result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
-            if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
+            if (useNaive(CV_TM_CCORR, image.depth(), templ.size()))
            {
                matchTemplateNaive_CCORR(image, templ, result, image.oclchannels());
                return;
            }
            else
            {
-                CV_Error(CV_StsBadArg, "Not supported yet for this size template");
                if(image.depth() == CV_8U && templ.depth() == CV_8U)
                {
                    image.convertTo(buf.imagef, CV_32F);
                    templ.convertTo(buf.templf, CV_32F);
+                    convolve_32F(buf.imagef, buf.templf, result, buf);
+                }
+                else
+                {
+                    convolve_32F(image, templ, result, buf);
                }
-                CV_Assert(image.oclchannels() == 1);
-                oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.oclchannels()));
-                filter2D(buf.imagef, o_result, CV_32F, buf.templf, Point(0, 0));
-                result = o_result(Rect(0, 0, image.rows - templ.rows + 1, image.cols - templ.cols + 1));
            }
        }

@ -249,7 +281,7 @@ namespace cv
            args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));

            size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {32, 8, 1};
+            size_t localThreads[3]  = {16, 16, 1};
            openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
        }

@ -284,7 +316,7 @@ namespace cv
            args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));

            size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {32, 8, 1};
+            size_t localThreads[3]  = {16, 16, 1};
            openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
        }
        //////////////////////////////////////////////////////////////////////
@ -301,7 +333,7 @@ namespace cv

            kernelName = "matchTemplate_Prepared_CCOFF";
            size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {32, 8, 1};
+            size_t localThreads[3]  = {16, 16, 1};

            vector< pair<size_t, const void *> > args;
            args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
@ -313,22 +345,22 @@ namespace cv
            args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
            args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
            args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+            Vec4f templ_sum = Vec4f::all(0);
            // to be continued in the following section
            if(image.oclchannels() == 1)
            {
                buf.image_sums.resize(1);
                integral(image, buf.image_sums[0]);

-                float templ_sum = 0;
-                templ_sum = (float)sum(templ)[0] / templ.size().area();
+                templ_sum[0] = (float)sum(templ)[0] / templ.size().area();
                args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
                args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
                args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
-                args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum) );
+                args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
            }
            else
            {
-                Vec4f templ_sum = Vec4f::all(0);
+                
                split(image, buf.images);
                templ_sum = sum(templ) / templ.size().area();
                buf.image_sums.resize(buf.images.size());
@ -374,7 +406,7 @@ namespace cv

            kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
            size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {32, 8, 1};
+            size_t localThreads[3]  = {16, 16, 1};

            vector< pair<size_t, const void *> > args;
            args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
@ -387,20 +419,22 @@ namespace cv
            args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
            args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
            args.push_back( make_pair( sizeof(cl_float), (void *)&scale) );
+
+            Vec4f templ_sum   = Vec4f::all(0);
+            Vec4f templ_sqsum = Vec4f::all(0);
            // to be continued in the following section
            if(image.oclchannels() == 1)
            {
                buf.image_sums.resize(1);
                buf.image_sqsums.resize(1);
                integral(image, buf.image_sums[0], buf.image_sqsums[0]);
-                float templ_sum = 0;
-                float templ_sqsum = 0;
-                templ_sum   = (float)sum(templ)[0];

-                templ_sqsum = sqrSum(templ)[0];
+                templ_sum[0]   = (float)sum(templ)[0];

-                templ_sqsum -= scale * templ_sum * templ_sum;
-                templ_sum   *= scale;
+                templ_sqsum[0] = sqrSum(templ)[0];
+
+                templ_sqsum[0] -= scale * templ_sum[0] * templ_sum[0];
+                templ_sum[0]   *= scale;

                args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
                args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
@ -408,13 +442,11 @@ namespace cv
                args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
                args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
                args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
-                args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum) );
-                args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum) );
+                args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
+                args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum[0]) );
            }
            else
            {
-                Vec4f templ_sum   = Vec4f::all(0);
-                Vec4f templ_sqsum = Vec4f::all(0);

                split(image, buf.images);
                templ_sum   = sum(templ);
@ -465,7 +497,27 @@ namespace cv
            }
            openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
        }
+        void extractFirstChannel_32F(const oclMat &image, oclMat &result)
+        {
+            Context *clCxt = image.clCxt;
+            string kernelName;

+            kernelName = "extractFirstChannel";
+            size_t globalThreads[3] = {result.cols, result.rows, 1};
+            size_t localThreads[3]  = {16, 16, 1};
+
+            vector< pair<size_t, const void *> > args;
+            args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data) );
+            args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
+            args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
+            args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
+            args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
+            args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
+            args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
+            args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
+
+            openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, -1, -1);
+        }
    }/*ocl*/
 } /*cv*/

--- a/modules/ocl/src/moments.cpp
+++ b/modules/ocl/src/moments.cpp
@ -143,7 +143,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
        args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step ));

        openCLExecuteKernel(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1);
-
+        
        cv::Mat dst(dst_a);
        a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0;
        if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE))
@ -277,16 +277,7 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
        blocky = size.height/TILE_SIZE;
    else
        blocky = size.height/TILE_SIZE + 1;
-    cv::ocl::oclMat dst_m00(blocky, blockx, CV_64FC1);
-    cv::ocl::oclMat dst_m10(blocky, blockx, CV_64FC1);
-    cv::ocl::oclMat dst_m01(blocky, blockx, CV_64FC1);
-    cv::ocl::oclMat dst_m20(blocky, blockx, CV_64FC1);
-    cv::ocl::oclMat dst_m11(blocky, blockx, CV_64FC1);
-    cv::ocl::oclMat dst_m02(blocky, blockx, CV_64FC1);
-    cv::ocl::oclMat dst_m30(blocky, blockx, CV_64FC1);
-    cv::ocl::oclMat dst_m21(blocky, blockx, CV_64FC1);
-    cv::ocl::oclMat dst_m12(blocky, blockx, CV_64FC1);
-    cv::ocl::oclMat dst_m03(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m(blocky * 10, blockx, CV_64FC1);
    cl_mem sum = openCLCreateBuffer(src.clCxt,CL_MEM_READ_WRITE,10*sizeof(double));
    int tile_width  = std::min(size.width,TILE_SIZE);
    int tile_height = std::min(size.height,TILE_SIZE);
@ -299,25 +290,17 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.width ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.height ));
-    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m00.data ));
-    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m10.data ));
-    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m01.data ));
-    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m20.data ));
-    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m11.data ));
-    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m02.data ));
-    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m30.data ));
-    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m21.data ));
-    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m12.data ));
-    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m03.data ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m00.cols ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m00.step ));
+    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&blocky ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&type ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&depth ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&cn ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&coi ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&binary ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
-    openCLExecuteKernel(dst_m00.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(dst_m.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);

    size_t localThreadss[3]  = { 128, 1, 1};
    size_t globalThreadss[3] = { 128, 1, 1};
@ -327,20 +310,12 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
    args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_width ));
    args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum ));
-    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m00.data ));
-    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m10.data ));
-    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m01.data ));
-    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m20.data ));
-    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m11.data ));
-    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m02.data ));
-    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m30.data ));
-    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m21.data ));
-    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m12.data ));
-    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m03.data ));
-    openCLExecuteKernel(dst_m00.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
+    args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
+    args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
+    openCLExecuteKernel(dst_m.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
    double* dstsum = new double[10];
    memset(dstsum,0,10*sizeof(double));
-    openCLReadBuffer(dst_m00.clCxt,sum,(void *)dstsum,10*sizeof(double));
+    openCLReadBuffer(dst_m.clCxt,sum,(void *)dstsum,10*sizeof(double));
    mom->m00 = dstsum[0];
    mom->m10 = dstsum[1];
    mom->m01 = dstsum[2];
@ -351,6 +326,7 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
    mom->m21 = dstsum[7];
    mom->m12 = dstsum[8];
    mom->m03 = dstsum[9];
+    delete [] dstsum;

    icvCompleteMomentState( mom );
 }
--- a/modules/ocl/src/opencl/brute_force_match.cl
+++ b/modules/ocl/src/opencl/brute_force_match.cl
@ -5,19 +5,93 @@ int bit1Count(float x)
 {
    int c = 0;
    int ix = (int)x;
-
    for (int i = 0 ; i < 32 ; i++)
    {
        c += ix & 0x1;
        ix >>= 1;
    }
-
    return (float)c;
 }
+
+float reduce_block(__local float *s_query,
+                   __local float *s_train,
+                   int block_size,
+                   int lidx,
+                   int lidy,
+                   int distType
+                  )
+{
+    /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+    sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+    float result = 0;
+    switch(distType)
+    {
+    case 0:
+        for (int j = 0 ; j < block_size ; j++)
+        {
+            result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
+        }
+        break;
+    case 1:
+        for (int j = 0 ; j < block_size ; j++)
+        {
+            float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
+            result += qr * qr;
+        }
+        break;
+    case 2:
+        for (int j = 0 ; j < block_size ; j++)
+        {
+            result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]);
+        }
+        break;
+    }
+    return result;
+}
+
+float reduce_multi_block(__local float *s_query,
+                         __local float *s_train,
+                         int max_desc_len,
+                         int block_size,
+                         int block_index,
+                         int lidx,
+                         int lidy,
+                         int distType
+                        )
+{
+    /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+    sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+    float result = 0;
+    switch(distType)
+    {
+    case 0:
+        for (int j = 0 ; j < block_size ; j++)
+        {
+            result += fabs(s_query[lidy * max_desc_len + block_index * block_size + j] -  s_train[j * block_size + lidx]);
+        }
+        break;
+    case 1:
+        for (int j = 0 ; j < block_size ; j++)
+        {
+            float qr = s_query[lidy * max_desc_len + block_index * block_size + j] -  s_train[j * block_size + lidx];
+            result += qr * qr;
+        }
+        break;
+    case 2:
+        for (int j = 0 ; j < block_size ; j++)
+        {
+            //result += popcount((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+            result += bit1Count((uint)s_query[lidy * max_desc_len + block_index * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+        }
+        break;
+    }
+    return result;
+}
+
 /* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
 local size: dim0 is block_size, dim1 is block_size.
 */
-__kernel void BruteForceMatch_UnrollMatch(
+__kernel void BruteForceMatch_UnrollMatch_D5(
    __global float *query,
    __global float *train,
    //__global float *mask,
@ -42,7 +116,6 @@ __kernel void BruteForceMatch_UnrollMatch(
    __local float *s_train = sharebuffer + block_size * max_desc_len;

    int queryIdx = groupidx * block_size + lidy;
-
    // load the query into local memory.
    for (int i = 0 ;  i <  max_desc_len / block_size; i ++)
    {
@ -55,11 +128,9 @@ __kernel void BruteForceMatch_UnrollMatch(

    // loopUnrolledCached to find the best trainIdx and best distance.
    volatile int imgIdx = 0;
-
    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
    {
        float result = 0;
-
        for (int i = 0 ; i < max_desc_len / block_size ; i++)
        {
            //load a block_size * block_size block into local train.
@ -69,38 +140,7 @@ __kernel void BruteForceMatch_UnrollMatch(
            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);

-            /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-            sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-            switch (distType)
-            {
-                case 0:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
-                    }
-
-                    break;
-                case 1:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
-                        result += qr * qr;
-                    }
-
-                    break;
-                case 2:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                        result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
-                    }
-
-                    break;
-            }
+            result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
@ -116,8 +156,8 @@ __kernel void BruteForceMatch_UnrollMatch(
    }

    barrier(CLK_LOCAL_MEM_FENCE);
-    __local float *s_distance = (__local float *)(sharebuffer);
-    __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
+    __local float *s_distance = (__local float*)(sharebuffer);
+    __local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);

    //find BestMatch
    s_distance += lidy * block_size;
@ -144,7 +184,7 @@ __kernel void BruteForceMatch_UnrollMatch(
    }
 }

-__kernel void BruteForceMatch_Match(
+__kernel void BruteForceMatch_Match_D5(
    __global float *query,
    __global float *train,
    //__global float *mask,
@ -177,7 +217,6 @@ __kernel void BruteForceMatch_Match(
    {
        //Dist dist;
        float result = 0;
-
        for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
        {
            const int loadx = lidx + i * block_size;
@ -193,38 +232,7 @@ __kernel void BruteForceMatch_Match(

            barrier(CLK_LOCAL_MEM_FENCE);

-            /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-            sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-            switch (distType)
-            {
-                case 0:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
-                    }
-
-                    break;
-                case 1:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
-                        result += qr * qr;
-                    }
-
-                    break;
-                case 2:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                        result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
-                    }
-
-                    break;
-            }
+            result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
@ -270,7 +278,7 @@ __kernel void BruteForceMatch_Match(
 }

 //radius_unrollmatch
-__kernel void BruteForceMatch_RadiusUnrollMatch(
+__kernel void BruteForceMatch_RadiusUnrollMatch_D5(
    __global float *query,
    __global float *train,
    float maxDistance,
@ -303,7 +311,6 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
    __local float *s_train = sharebuffer + block_size * block_size;

    float result = 0;
-
    for (int i = 0 ; i < max_desc_len / block_size ; ++i)
    {
        //load a block_size * block_size block into local train.
@ -315,37 +322,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
        //synchronize to make sure each elem for reduceIteration in share memory is written already.
        barrier(CLK_LOCAL_MEM_FENCE);

-        /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-        sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-        switch (distType)
-        {
-            case 0:
-
-                for (int j = 0 ; j < block_size ; ++j)
-                {
-                    result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
-                }
-
-                break;
-            case 1:
-
-                for (int j = 0 ; j < block_size ; ++j)
-                {
-                    float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
-                    result += qr * qr;
-                }
-
-                break;
-            case 2:
-
-                for (int j = 0 ; j < block_size ; ++j)
-                {
-                    result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
-                }
-
-                break;
-        }
+        result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);

        barrier(CLK_LOCAL_MEM_FENCE);
    }
@ -354,7 +331,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
    {
        unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);

-        if (ind < bestTrainIdx_cols)
+        if(ind < bestTrainIdx_cols)
        {
            //bestImgIdx = imgIdx;
            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
@ -364,7 +341,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
 }

 //radius_match
-__kernel void BruteForceMatch_RadiusMatch(
+__kernel void BruteForceMatch_RadiusMatch_D5(
    __global float *query,
    __global float *train,
    float maxDistance,
@ -396,7 +373,6 @@ __kernel void BruteForceMatch_RadiusMatch(
    __local float *s_train = sharebuffer + block_size * block_size;

    float result = 0;
-
    for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
    {
        //load a block_size * block_size block into local train.
@ -408,46 +384,16 @@ __kernel void BruteForceMatch_RadiusMatch(
        //synchronize to make sure each elem for reduceIteration in share memory is written already.
        barrier(CLK_LOCAL_MEM_FENCE);

-        /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-        sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-        switch (distType)
-        {
-            case 0:
-
-                for (int j = 0 ; j < block_size ; ++j)
-                {
-                    result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
-                }
-
-                break;
-            case 1:
-
-                for (int j = 0 ; j < block_size ; ++j)
-                {
-                    float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
-                    result += qr * qr;
-                }
-
-                break;
-            case 2:
-
-                for (int j = 0 ; j < block_size ; ++j)
-                {
-                    result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
-                }
-
-                break;
-        }
+        result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);

        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
    {
-        unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
+        unsigned int ind = atom_inc(nMatches + queryIdx);

-        if (ind < bestTrainIdx_cols)
+        if(ind < bestTrainIdx_cols)
        {
            //bestImgIdx = imgIdx;
            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
@ -457,7 +403,7 @@ __kernel void BruteForceMatch_RadiusMatch(
 }


-__kernel void BruteForceMatch_knnUnrollMatch(
+__kernel void BruteForceMatch_knnUnrollMatch_D5(
    __global float *query,
    __global float *train,
    //__global float *mask,
@ -496,11 +442,9 @@ __kernel void BruteForceMatch_knnUnrollMatch(

    //loopUnrolledCached
    volatile int imgIdx = 0;
-
    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
    {
        float result = 0;
-
        for (int i = 0 ; i < max_desc_len / block_size ; i++)
        {
            const int loadX = lidx + i * block_size;
@ -511,38 +455,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);

-            /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-            sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-            switch (distType)
-            {
-                case 0:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
-                    }
-
-                    break;
-                case 1:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
-                        result += qr * qr;
-                    }
-
-                    break;
-                case 2:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                        result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
-                    }
-
-                    break;
-            }
+            result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
@ -589,7 +502,6 @@ __kernel void BruteForceMatch_knnUnrollMatch(
        for (int i = 0 ; i < block_size ; i++)
        {
            float val = s_distance[i];
-
            if (val < bestDistance1)
            {
                bestDistance2 = bestDistance1;
@ -640,7 +552,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
    }
 }

-__kernel void BruteForceMatch_knnMatch(
+__kernel void BruteForceMatch_knnMatch_D5(
    __global float *query,
    __global float *train,
    //__global float *mask,
@ -673,8 +585,7 @@ __kernel void BruteForceMatch_knnMatch(
    for (int  t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
    {
        float result = 0.0f;
-
-        for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
+        for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++)
        {
            const int loadx = lidx + i * block_size;
            //load query and train into local memory
@ -689,38 +600,7 @@ __kernel void BruteForceMatch_knnMatch(

            barrier(CLK_LOCAL_MEM_FENCE);

-            /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-            sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
-            switch (distType)
-            {
-                case 0:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
-                    }
-
-                    break;
-                case 1:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
-                        result += qr * qr;
-                    }
-
-                    break;
-                case 2:
-
-                    for (int j = 0 ; j < block_size ; j++)
-                    {
-                        //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                        result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
-                    }
-
-                    break;
-            }
+            result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);

            barrier(CLK_LOCAL_MEM_FENCE);
        }
@ -767,7 +647,6 @@ __kernel void BruteForceMatch_knnMatch(
        for (int i = 0 ; i < block_size ; i++)
        {
            float val = s_distance[i];
-
            if (val < bestDistance1)
            {
                bestDistance2 = bestDistance1;
@ -818,7 +697,7 @@ __kernel void BruteForceMatch_knnMatch(
    }
 }

-kernel void BruteForceMatch_calcDistanceUnrolled(
+kernel void BruteForceMatch_calcDistanceUnrolled_D5(
    __global float *query,
    __global float *train,
    //__global float *mask,
@ -836,7 +715,7 @@ kernel void BruteForceMatch_calcDistanceUnrolled(
    /* Todo */
 }

-kernel void BruteForceMatch_calcDistance(
+kernel void BruteForceMatch_calcDistance_D5(
    __global float *query,
    __global float *train,
    //__global float *mask,
@ -853,7 +732,7 @@ kernel void BruteForceMatch_calcDistance(
    /* Todo */
 }

-kernel void BruteForceMatch_findBestMatch(
+kernel void BruteForceMatch_findBestMatch_D5(
    __global float *allDist,
    __global int *bestTrainIdx,
    __global float *bestDistance,
--- a/modules/ocl/src/opencl/haarobjectdetect.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect.cl
@ -211,10 +211,14 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                int4 data = *(__global int4*)&sum[glb_off];
                int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);

+#if OFF
                lcldata[lcl_off] = data.x;
                lcldata[lcl_off+1] = data.y;
                lcldata[lcl_off+2] = data.z;
                lcldata[lcl_off+3] = data.w;
+#else
+                vstore4(data, 0, &lcldata[lcl_off]);
+#endif
            }

            lcloutindex[lcl_id] = 0;
@ -559,3 +563,7 @@ if(result)
 }
 }
 */
+
+
+
+
--- a/modules/ocl/src/opencl/match_template.cl
+++ b/modules/ocl/src/opencl/match_template.cl
@ -45,22 +45,28 @@

 #pragma OPENCL EXTENSION cl_amd_printf : enable

-#if defined (__ATI__)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#if defined (DOUBLE_SUPPORT)

-#elif defined (__NVIDIA__)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
 #endif

-#if !defined(USE_SQR_INTEGRAL) && (defined (__ATI__) || defined (__NVIDIA__))
 #define TYPE_IMAGE_SQSUM double
 #else
-#define TYPE_IMAGE_SQSUM ulong
+#define TYPE_IMAGE_SQSUM float
+#endif
+
+#ifndef CN4
+#define CN4 1
+#else
+#define CN4 4
 #endif

 //////////////////////////////////////////////////
 // utilities
-#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, gidx + img_sqsums_offset + ox)
+#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, (gidx + img_sqsums_offset + ox) * CN4)
 #define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox)
 // normAcc* are accurate normalization routines which make GPU matchTemplate
 // consistent with CPU one
@ -95,7 +101,7 @@ float normAcc_SQDIFF(float num, float denum)
 __kernel
 void normalizeKernel_C1_D0
 (
-    __global const TYPE_IMAGE_SQSUM * img_sqsums,
+    __global const float * img_sqsums,
    __global float * res,
    ulong tpl_sqsum,
    int res_rows,
@ -119,8 +125,8 @@ void normalizeKernel_C1_D0
    if(gidx < res_cols && gidy < res_rows)
    {
        float image_sqsum_ = (float)(
-            (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+                                 (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+                                 (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
        res[res_idx] = normAcc(res[res_idx], sqrt(image_sqsum_ * tpl_sqsum));
    }
 }
@ -152,8 +158,8 @@ void matchTemplate_Prepared_SQDIFF_C1_D0
    if(gidx < res_cols && gidy < res_rows)
    {
        float image_sqsum_ = (float)(
-            (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+                                 (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+                                 (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
        res[res_idx] = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum;
    }
 }
@ -161,7 +167,7 @@ void matchTemplate_Prepared_SQDIFF_C1_D0
 __kernel
 void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
 (
-    __global const TYPE_IMAGE_SQSUM * img_sqsums,
+    __global const float * img_sqsums,
    __global float * res,
    ulong tpl_sqsum,
    int res_rows,
@ -185,10 +191,10 @@ void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
    if(gidx < res_cols && gidy < res_rows)
    {
        float image_sqsum_ = (float)(
-            (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+                                 (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+                                 (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
        res[res_idx] = normAcc_SQDIFF(image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum,
-                                        sqrt(image_sqsum_ * tpl_sqsum));
+                                      sqrt(image_sqsum_ * tpl_sqsum));
    }
 }

@ -628,8 +634,8 @@ void matchTemplate_Prepared_CCOFF_C1_D0
    if(gidx < res_cols && gidy < res_rows)
    {
        float sum = (float)(
-            (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
+                        (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
+                        - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
        res[res_idx] -= sum * tpl_sum;
    }
 }
@ -671,17 +677,17 @@ void matchTemplate_Prepared_CCOFF_C4_D0
    {
        float ccorr = res[res_idx];
        ccorr -= tpl_sum_c0*(float)(
-            (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
+                     (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
+                     - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
        ccorr -= tpl_sum_c1*(float)(
-            (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
+                     (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
+                     - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
        ccorr -= tpl_sum_c2*(float)(
-            (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
+                     (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
+                     - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
        ccorr -= tpl_sum_c3*(float)(
-            (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
+                     (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
+                     - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
        res[res_idx] = ccorr;
    }
 }
@ -702,7 +708,7 @@ void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
    __global const uint * img_sums,
    int img_sums_offset,
    int img_sums_step,
-    __global const TYPE_IMAGE_SQSUM * img_sqsums,
+    __global const float * img_sqsums,
    int img_sqsums_offset,
    int img_sqsums_step,
    float tpl_sum,
@ -725,12 +731,12 @@ void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
    if(gidx < res_cols && gidy < res_rows)
    {
        float image_sum_ =  (float)(
-            (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
+                                (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
+                                - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));

        float image_sqsum_ = (float)(
-            (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
+                                 (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
+                                 (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
        res[res_idx] = normAcc(res[res_idx] - image_sum_ * tpl_sum,
                               sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_)));
    }
@ -754,10 +760,10 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
    __global const uint * img_sums_c3,
    int img_sums_offset,
    int img_sums_step,
-    __global const TYPE_IMAGE_SQSUM * img_sqsums_c0,
-    __global const TYPE_IMAGE_SQSUM * img_sqsums_c1,
-    __global const TYPE_IMAGE_SQSUM * img_sqsums_c2,
-    __global const TYPE_IMAGE_SQSUM * img_sqsums_c3,
+    __global const float * img_sqsums_c0,
+    __global const float * img_sqsums_c1,
+    __global const float * img_sqsums_c2,
+    __global const float * img_sqsums_c3,
    int img_sqsums_offset,
    int img_sqsums_step,
    float tpl_sum_c0,
@ -782,42 +788,71 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
    if(gidx < res_cols && gidy < res_rows)
    {
        float image_sum_c0 =  (float)(
-            (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
+                                  (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
+                                  - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
        float image_sum_c1 =  (float)(
-            (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
+                                  (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
+                                  - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
        float image_sum_c2 =  (float)(
-            (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
+                                  (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
+                                  - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
        float image_sum_c3 =  (float)(
-            (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
-          - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
+                                  (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
+                                  - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));

        float image_sqsum_c0 = (float)(
-            (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)]));
+                                   (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) -
+                                   (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)]));
        float image_sqsum_c1 = (float)(
-            (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)]));
+                                   (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) -
+                                   (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)]));
        float image_sqsum_c2 = (float)(
-            (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)]));
+                                   (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) -
+                                   (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)]));
        float image_sqsum_c3 = (float)(
-            (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)]));
+                                   (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) -
+                                   (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)]));

        float num = res[res_idx] -
-            image_sum_c0 * tpl_sum_c0 -
-            image_sum_c1 * tpl_sum_c1 -
-            image_sum_c2 * tpl_sum_c2 -
-            image_sum_c3 * tpl_sum_c3;
+                    image_sum_c0 * tpl_sum_c0 -
+                    image_sum_c1 * tpl_sum_c1 -
+                    image_sum_c2 * tpl_sum_c2 -
+                    image_sum_c3 * tpl_sum_c3;
        float denum = sqrt( tpl_sqsum * (
-            image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 +
-            image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 +
-            image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 +
-            image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3)
-            );
+                                image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 +
+                                image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 +
+                                image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 +
+                                image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3)
+                          );
        res[res_idx] = normAcc(num, denum);
    }
 }
+
+//////////////////////////////////////////////////////////////////////
+// extractFirstChannel
+__kernel
+void extractFirstChannel
+(
+    const __global float4* img,
+    __global float* res,
+    int rows,
+    int cols,
+    int img_offset,
+    int res_offset,
+    int img_step,
+    int res_step
+)
+{
+    img_step   /= sizeof(float4);
+    res_step   /= sizeof(float);
+    img_offset /= sizeof(float4);
+    res_offset /= sizeof(float);
+    img += img_offset;
+    res += res_offset;
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+    if(gidx < cols && gidy < rows)
+    {
+        res[gidx + gidy * res_step] = img[gidx + gidy * img_step].x;
+    }
+}
--- a/modules/ocl/src/opencl/moments.cl
+++ b/modules/ocl/src/opencl/moments.cl
@ -6,25 +6,27 @@
 #pragma OPENCL EXTENSION cl_amd_fp64:enable
 #endif
 typedef double T;
+typedef double F;
+typedef double4 F4;
+#define convert_F4 convert_double4

 #else
-typedef float double;
-typedef float4 double4;
+typedef float F;
+typedef float4 F4;
 typedef long T;
-#define convert_double4 convert_float4
+#define convert_F4 convert_float4
 #endif
-//#pragma OPENCL EXTENSION cl_amd_printf:enable
-//#if defined (DOUBLE_SUPPORT)
-#define DST_ROW_A00     0
-#define DST_ROW_A10     1
-#define DST_ROW_A01     2
-#define DST_ROW_A20     3
-#define DST_ROW_A11     4
-#define DST_ROW_A02     5
-#define DST_ROW_A30     6
-#define DST_ROW_A21     7
-#define DST_ROW_A12     8
-#define DST_ROW_A03     9
+
+#define DST_ROW_00     0
+#define DST_ROW_10     1
+#define DST_ROW_01     2
+#define DST_ROW_20     3
+#define DST_ROW_11     4
+#define DST_ROW_02     5
+#define DST_ROW_30     6
+#define DST_ROW_21     7
+#define DST_ROW_12     8
+#define DST_ROW_03     9

 __kernel void icvContourMoments(int contour_total,
                                __global float* reader_oclmat_data,
@ -60,36 +62,76 @@ __kernel void icvContourMoments(int contour_total,
    yii_1 = yi_1 + yi;

    dst_step /= sizeof(T);
-    *( dst_a + DST_ROW_A00 * dst_step + idx) = dxy;
-    *( dst_a + DST_ROW_A10 * dst_step + idx) = dxy * xii_1;
-    *( dst_a + DST_ROW_A01 * dst_step + idx) = dxy * yii_1;
-    *( dst_a + DST_ROW_A20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2);
-    *( dst_a + DST_ROW_A11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi));
-    *( dst_a + DST_ROW_A02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2);
-    *( dst_a + DST_ROW_A30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2);
-    *( dst_a + DST_ROW_A03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2);
-    *( dst_a + DST_ROW_A21 * dst_step + idx) =
+    *( dst_a + DST_ROW_00 * dst_step + idx) = dxy;
+    *( dst_a + DST_ROW_10 * dst_step + idx) = dxy * xii_1;
+    *( dst_a + DST_ROW_01 * dst_step + idx) = dxy * yii_1;
+    *( dst_a + DST_ROW_20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2);
+    *( dst_a + DST_ROW_11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi));
+    *( dst_a + DST_ROW_02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2);
+    *( dst_a + DST_ROW_30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2);
+    *( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2);
+    *( dst_a + DST_ROW_21 * dst_step + idx) =
        dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 +
               xi2 * (yi_1 + 3 * yi));
-    *( dst_a + DST_ROW_A12 * dst_step + idx) =
+    *( dst_a + DST_ROW_12 * dst_step + idx) =
        dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 +
               yi2 * (xi_1 + 3 * xi));
 }
-//#endif

-//#if defined (DOUBLE_SUPPORT)
+__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE,
+                      __global F* sum, __global F* dst_m, int dst_step)
+{
+    int gidy = get_global_id(0);
+    int gidx = get_global_id(1);
+    int block_y = src_rows/tile_height;
+    int block_x = src_cols/tile_width;
+    int block_num;
+
+    if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0)
+        block_y ++;
+    if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0)
+        block_x ++;
+    block_num = block_y * block_x;
+    __local F dst_sum[10][128];
+    if(gidy<128-block_num)
+        for(int i=0; i<10; i++)
+            dst_sum[i][gidy+block_num]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    dst_step /= sizeof(F);
+    if(gidy<block_num)
+    {
+        dst_sum[0][gidy] = *(dst_m + mad24(DST_ROW_00 * block_y, dst_step, gidy));
+        dst_sum[1][gidy] = *(dst_m + mad24(DST_ROW_10 * block_y, dst_step, gidy));
+        dst_sum[2][gidy] = *(dst_m + mad24(DST_ROW_01 * block_y, dst_step, gidy));
+        dst_sum[3][gidy] = *(dst_m + mad24(DST_ROW_20 * block_y, dst_step, gidy));
+        dst_sum[4][gidy] = *(dst_m + mad24(DST_ROW_11 * block_y, dst_step, gidy));
+        dst_sum[5][gidy] = *(dst_m + mad24(DST_ROW_02 * block_y, dst_step, gidy));
+        dst_sum[6][gidy] = *(dst_m + mad24(DST_ROW_30 * block_y, dst_step, gidy));
+        dst_sum[7][gidy] = *(dst_m + mad24(DST_ROW_21 * block_y, dst_step, gidy));
+        dst_sum[8][gidy] = *(dst_m + mad24(DST_ROW_12 * block_y, dst_step, gidy));
+        dst_sum[9][gidy] = *(dst_m + mad24(DST_ROW_03 * block_y, dst_step, gidy));
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for(int lsize=64; lsize>0; lsize>>=1)
+    {
+        if(gidy<lsize)
+        {
+            int lsize2 = gidy + lsize;
+            for(int i=0; i<10; i++)
+                dst_sum[i][gidy] += dst_sum[i][lsize2];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(gidy==0)
+        for(int i=0; i<10; i++)
+            sum[i] = dst_sum[i][0];
+}
+
 __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
-                           __global double* dst_m00,
-                           __global double* dst_m10,
-                           __global double* dst_m01,
-                           __global double* dst_m20,
-                           __global double* dst_m11,
-                           __global double* dst_m02,
-                           __global double* dst_m30,
-                           __global double* dst_m21,
-                           __global double* dst_m12,
-                           __global double* dst_m03,
-                           int dst_cols, int dst_step, int type, int depth, int cn, int coi, int binary, int TILE_SIZE)
+                           __global F* dst_m,
+                           int dst_cols, int dst_step, int blocky,
+                           int type, int depth, int cn, int coi, int binary, int TILE_SIZE)
 {
    uchar tmp_coi[16]; // get the coi data
    uchar16 tmp[16];
@ -127,7 +169,7 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
    if( binary )
        for(int i=0; i < tileSize_width; i+=VLEN_C)
            tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
-    double mom[10];
+    F mom[10];
    __local int m[10][128];
    if(lidy == 0)
        for(int i=0; i<10; i++)
@ -197,119 +239,53 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
    if(lidy == 0&&lidx == 0)
    {
        for( int mt = 0; mt < 10; mt++ )
-            mom[mt] = (double)lm[mt];
+            mom[mt] = (F)lm[mt];
        if(binary)
        {
-            double s = 1./255;
+            F s = 1./255;
            for( int mt = 0; mt < 10; mt++ )
                mom[mt] *= s;
        }
-        double xm = x * mom[0], ym = y * mom[0];
+        F xm = x * mom[0], ym = y * mom[0];

        // accumulate moments computed in each tile
+        dst_step /= sizeof(F);

        // + m00 ( = m00' )
-        dst_m00[wgidy*dst_cols+wgidx] = mom[0];
+        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];

        // + m10 ( = m10' + x*m00' )
-        dst_m10[wgidy*dst_cols+wgidx]  = mom[1] + xm;
+        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;

        // + m01 ( = m01' + y*m00' )
-        dst_m01[wgidy*dst_cols+wgidx]  = mom[2] + ym;
+        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;

        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
-        dst_m20[wgidy*dst_cols+wgidx]  = mom[3] + x * (mom[1] * 2 + xm);
+        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);

        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
-        dst_m11[wgidy*dst_cols+wgidx]  = mom[4] + x * (mom[2] + ym) + y * mom[1];
+        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];

        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
-        dst_m02[wgidy*dst_cols+wgidx]  = mom[5] + y * (mom[2] * 2 + ym);
+        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);

        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
-        dst_m30[wgidy*dst_cols+wgidx]  = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));

        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
-        dst_m21[wgidy*dst_cols+wgidx]  = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];

        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
-        dst_m12[wgidy*dst_cols+wgidx]  = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];

        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
-        dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
    }
 }
-//#endif
-//#if defined (DOUBLE_SUPPORT)
-__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, __global double* sum, __global double* dst_m00,
-                      __global double* dst_m10,
-                      __global double* dst_m01,
-                      __global double* dst_m20,
-                      __global double* dst_m11,
-                      __global double* dst_m02,
-                      __global double* dst_m30,
-                      __global double* dst_m21,
-                      __global double* dst_m12,
-                      __global double* dst_m03)
-{
-    int gidy = get_global_id(0);
-    int gidx = get_global_id(1);
-    int block_y = src_rows/tile_height;
-    int block_x = src_cols/tile_width;
-    int block_num;

-    if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0)
-        block_y ++;
-    if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0)
-        block_x ++;
-    block_num = block_y * block_x;
-    __local double dst_sum[10][128];
-    if(gidy<128-block_num)
-        for(int i=0; i<10; i++)
-            dst_sum[i][gidy+block_num]=0;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(gidy<block_num)
-    {
-        dst_sum[0][gidy] = dst_m00[gidy];
-        dst_sum[1][gidy] = dst_m10[gidy];
-        dst_sum[2][gidy] = dst_m01[gidy];
-        dst_sum[3][gidy] = dst_m20[gidy];
-        dst_sum[4][gidy] = dst_m11[gidy];
-        dst_sum[5][gidy] = dst_m02[gidy];
-        dst_sum[6][gidy] = dst_m30[gidy];
-        dst_sum[7][gidy] = dst_m21[gidy];
-        dst_sum[8][gidy] = dst_m12[gidy];
-        dst_sum[9][gidy] = dst_m03[gidy];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for(int lsize=64; lsize>0; lsize>>=1)
-    {
-        if(gidy<lsize)
-        {
-            int lsize2 = gidy + lsize;
-            for(int i=0; i<10; i++)
-                dst_sum[i][gidy] += dst_sum[i][lsize2];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if(gidy==0)
-        for(int i=0; i<10; i++)
-            sum[i] = dst_sum[i][0];
-}
-//#endif
-//#if defined (DOUBLE_SUPPORT)
 __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
-                           __global double* dst_m00,
-                           __global double* dst_m10,
-                           __global double* dst_m01,
-                           __global double* dst_m20,
-                           __global double* dst_m11,
-                           __global double* dst_m02,
-                           __global double* dst_m30,
-                           __global double* dst_m21,
-                           __global double* dst_m12,
-                           __global double* dst_m03,
-                           int dst_cols, int dst_step,
+                           __global F* dst_m,
+                           int dst_cols, int dst_step, int blocky,
                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
 {
    ushort tmp_coi[8]; // get the coi data
@ -345,7 +321,7 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
    if( binary )
        for(int i=0; i < tileSize_width; i+=VLEN_US)
            tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
-    double mom[10];
+    F mom[10];
    __local long m[10][128];
    if(lidy == 0)
        for(int i=0; i<10; i++)
@ -415,64 +391,55 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
    if(lidy == 0&&lidx == 0)
    {
        for(int mt = 0; mt < 10; mt++ )
-            mom[mt] = (double)lm[mt];
+            mom[mt] = (F)lm[mt];

        if(binary)
        {
-            double s = 1./255;
+            F s = 1./255;
            for( int mt = 0; mt < 10; mt++ )
                mom[mt] *= s;
        }

-        double xm = x  *mom[0], ym = y * mom[0];
+        F xm = x  *mom[0], ym = y * mom[0];

        // accumulate moments computed in each tile
+        dst_step /= sizeof(F);

        // + m00 ( = m00' )
-        dst_m00[wgidy*dst_cols+wgidx] = mom[0];
+        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];

        // + m10 ( = m10' + x*m00' )
-        dst_m10[wgidy*dst_cols+wgidx]  = mom[1] + xm;
+        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;

        // + m01 ( = m01' + y*m00' )
-        dst_m01[wgidy*dst_cols+wgidx]  = mom[2] + ym;
+        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;

        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
-        dst_m20[wgidy*dst_cols+wgidx]  = mom[3] + x * (mom[1] * 2 + xm);
+        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);

        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
-        dst_m11[wgidy*dst_cols+wgidx]  = mom[4] + x * (mom[2] + ym) + y * mom[1];
+        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];

        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
-        dst_m02[wgidy*dst_cols+wgidx]  = mom[5] + y * (mom[2] * 2 + ym);
+        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);

        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
-        dst_m30[wgidy*dst_cols+wgidx]  = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));

        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
-        dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];

        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
-        dst_m12[wgidy*dst_cols+wgidx]  = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];

        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
-        dst_m03[wgidy*dst_cols+wgidx]  = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
    }
 }
-//#endif
-//#if defined (DOUBLE_SUPPORT)
+
 __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
-                           __global double* dst_m00,
-                           __global double* dst_m10,
-                           __global double* dst_m01,
-                           __global double* dst_m20,
-                           __global double* dst_m11,
-                           __global double* dst_m02,
-                           __global double* dst_m30,
-                           __global double* dst_m21,
-                           __global double* dst_m12,
-                           __global double* dst_m03,
-                           int dst_cols, int dst_step,
+                           __global F* dst_m,
+                           int dst_cols, int dst_step, int blocky,
                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
 {
    short tmp_coi[8]; // get the coi data
@ -509,7 +476,7 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
        for(int i=0; i < tileSize_width; i+=(VLEN_S))
            tmp[i/VLEN_S] = (tmp[i/VLEN_S]!=zero)?full:zero;

-    double mom[10];
+    F mom[10];
    __local long m[10][128];
    if(lidy == 0)
        for(int i=0; i<10; i++)
@ -579,64 +546,55 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
    if(lidy ==0 &&lidx ==0)
    {
        for(int mt = 0; mt < 10; mt++ )
-            mom[mt] = (double)lm[mt];
+            mom[mt] = (F)lm[mt];

        if(binary)
        {
-            double s = 1./255;
+            F s = 1./255;
            for( int mt = 0; mt < 10; mt++ )
                mom[mt] *= s;
        }

-        double xm = x * mom[0], ym = y*mom[0];
+        F xm = x * mom[0], ym = y*mom[0];

        // accumulate moments computed in each tile
+        dst_step /= sizeof(F);

        // + m00 ( = m00' )
-        dst_m00[wgidy*dst_cols+wgidx]  = mom[0];
+        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];

        // + m10 ( = m10' + x*m00' )
-        dst_m10[wgidy*dst_cols+wgidx]  = mom[1] + xm;
+        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;

        // + m01 ( = m01' + y*m00' )
-        dst_m01[wgidy*dst_cols+wgidx]  = mom[2] + ym;
+        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;

        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
-        dst_m20[wgidy*dst_cols+wgidx]  = mom[3] + x * (mom[1] * 2 + xm);
+        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);

        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
-        dst_m11[wgidy*dst_cols+wgidx]   = mom[4] + x * (mom[2] + ym) + y * mom[1];
+        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];

        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
-        dst_m02[wgidy*dst_cols+wgidx]   = mom[5] + y * (mom[2] * 2 + ym);
+        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);

        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
-        dst_m30[wgidy*dst_cols+wgidx]  = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));

        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
-        dst_m21[wgidy*dst_cols+wgidx]  = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];

        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
-        dst_m12[wgidy*dst_cols+wgidx]  = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];

        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
-        dst_m03[wgidy*dst_cols+wgidx]  = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
    }
 }
-//#endif
-//#if defined (DOUBLE_SUPPORT)
+
 __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
-                            __global double* dst_m00,
-                            __global double* dst_m10,
-                            __global double* dst_m01,
-                            __global double* dst_m20,
-                            __global double* dst_m11,
-                            __global double* dst_m02,
-                            __global double* dst_m30,
-                            __global double* dst_m21,
-                            __global double* dst_m12,
-                            __global double* dst_m03,
-                            int dst_cols, int dst_step,
+                            __global F* dst_m,
+                            int dst_cols, int dst_step, int blocky,
                            int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
 {
    float tmp_coi[4]; // get the coi data
@ -672,23 +630,23 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
    if( binary )
        for(int i=0; i < tileSize_width; i+=4)
            tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
-    double mom[10];
-    __local double m[10][128];
+    F mom[10];
+    __local F m[10][128];
    if(lidy == 0)
        for(int i = 0; i < 10; i ++)
            for(int j = 0; j < 128; j ++)
                m[i][j] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
-    double lm[10] = {0};
-    double4 x0 = (double4)(0);
-    double4 x1 = (double4)(0);
-    double4 x2 = (double4)(0);
-    double4 x3 = (double4)(0);
+    F lm[10] = {0};
+    F4 x0 = (F4)(0);
+    F4 x1 = (F4)(0);
+    F4 x2 = (F4)(0);
+    F4 x3 = (F4)(0);
    for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F )
    {
-        double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3);
-        double4 p = convert_double4(tmp[xt/VLEN_F]);
-        double4 xp = v_xt * p, xxp = xp * v_xt;
+        F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
+        F4 p = convert_F4(tmp[xt/VLEN_F]);
+        F4 xp = v_xt * p, xxp = xp * v_xt;
        x0 += p;
        x1 += xp;
        x2 += xxp;
@ -698,178 +656,14 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
    x1.s0 += x1.s1 + x1.s2 + x1.s3;
    x2.s0 += x2.s1 + x2.s2 + x2.s3;
    x3.s0 += x3.s1 + x3.s2 + x3.s3;
-/*
-    double py = lidy * x0.s0, sy = lidy*lidy;
+
+    F py = lidy * x0.s0, sy = lidy*lidy;
    int bheight = min(tileSize_height, TILE_SIZE/2);
    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
    {
-        m[9][lidy-bheight] = ((double)py) * sy;  // m03
-        m[8][lidy-bheight] = ((double)x1.s0) * sy;  // m12
-        m[7][lidy-bheight] = ((double)x2.s0) * lidy;  // m21
-        m[6][lidy-bheight] = x3.s0;             // m30
-        m[5][lidy-bheight] = x0.s0 * sy;        // m02
-        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
-        m[3][lidy-bheight] = x2.s0;             // m20
-        m[2][lidy-bheight] = py;             // m01
-        m[1][lidy-bheight] = x1.s0;             // m10
-        m[0][lidy-bheight] = x0.s0;             // m00
-    }
-    else if(lidy < bheight)
-    {
-        lm[9] = ((double)py) * sy;  // m03
-        lm[8] = ((double)x1.s0) * sy;  // m12
-        lm[7] = ((double)x2.s0) * lidy;  // m21
-        lm[6] = x3.s0;             // m30
-        lm[5] = x0.s0 * sy;        // m02
-        lm[4] = x1.s0 * lidy;         // m11
-        lm[3] = x2.s0;             // m20
-        lm[2] = py;             // m01
-        lm[1] = x1.s0;             // m10
-        lm[0] = x0.s0;             // m00
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
-    {
-        if(lidy < j)
-            for( int i = 0; i < 10; i++ )
-                lm[i] = lm[i] + m[i][lidy];
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lidy >= j/2&&lidy < j)
-            for( int i = 0; i < 10; i++ )
-                m[i][lidy-j/2] = lm[i];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if(lidy == 0&&lidx == 0)
-    {
-        for(int mt = 0; mt < 10; mt++ )
-            mom[mt] = (double)lm[mt];
-
-        if(binary)
-        {
-            double s = 1./255;
-            for( int mt = 0; mt < 10; mt++ )
-                mom[mt] *= s;
-        }
-
-        double xm = x * mom[0], ym = y * mom[0];
-
-        // accumulate moments computed in each tile
-
-        // + m00 ( = m00' )
-        dst_m00[wgidy*dst_cols+wgidx]= mom[0];
-
-        // + m10 ( = m10' + x*m00' )
-        dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
-
-        // + m01 ( = m01' + y*m00' )
-        dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
-
-        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
-        dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);
-
-        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
-        dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];
-
-        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
-        dst_m02[wgidy*dst_cols+wgidx]= mom[5] + y * (mom[2] * 2 + ym);
-
-        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
-        dst_m30[wgidy*dst_cols+wgidx]= mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
-
-        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
-        dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
-
-        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
-        dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
-
-        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
-        dst_m03[wgidy*dst_cols+wgidx]= mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
-    }*/
-}
-//#endif
-//#if defined (DOUBLE_SUPPORT)
-__kernel void CvMoments_D6(__global double* src_data,  int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
-                           __global double* dst_m00,
-                           __global double* dst_m10,
-                           __global double* dst_m01,
-                           __global double* dst_m20,
-                           __global double* dst_m11,
-                           __global double* dst_m02,
-                           __global double* dst_m30,
-                           __global double* dst_m21,
-                           __global double* dst_m12,
-                           __global double* dst_m03,
-                           int dst_cols, int dst_step,
-                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
-{
-    double tmp_coi[4]; // get the coi data
-    double4 tmp[64];
-    int VLEN_D = 4; // length of vetor
-    int gidy = get_global_id(0);
-    int gidx = get_global_id(1);
-    int wgidy = get_group_id(0);
-    int wgidx = get_group_id(1);
-    int lidy = get_local_id(0);
-    int lidx = get_local_id(1);
-    int y = wgidy*TILE_SIZE;  // real Y index of pixel
-    int x = wgidx*TILE_SIZE;  // real X index of pixel
-    int kcn = (cn==2)?2:4;
-    int rstep = min(src_step/8, TILE_SIZE);
-    tileSize_height = min(TILE_SIZE,  src_rows - y);
-    tileSize_width = min(TILE_SIZE, src_cols - x);
-
-    if(tileSize_width < TILE_SIZE)
-        for(int i = tileSize_width; i < rstep; i++ )
-            *((__global double*)src_data+(y+lidy)*src_step/8+x+i) = 0;
-    if( coi > 0 )
-        for(int i=0; i < tileSize_width; i+=VLEN_D)
-        {
-            for(int j=0; j<4; j++)
-                tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
-            tmp[i/VLEN_D] = (double4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
-        }
-    else
-        for(int i=0; i < tileSize_width; i+=VLEN_D)
-            tmp[i/VLEN_D] = (double4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
-    double4 zero = (double4)(0);
-    double4 full = (double4)(255);
-    if( binary )
-        for(int i=0; i < tileSize_width; i+=VLEN_D)
-            tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
-    double mom[10];
-    __local double m[10][128];
-    if(lidy == 0)
-        for(int i=0; i<10; i++)
-            for(int j=0; j<128; j++)
-                m[i][j]=0;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    double lm[10] = {0};
-    double4 x0 = (double4)(0);
-    double4 x1 = (double4)(0);
-    double4 x2 = (double4)(0);
-    double4 x3 = (double4)(0);
-    for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
-    {
-        double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3);
-        double4 p = tmp[xt/VLEN_D];
-        double4 xp = v_xt * p, xxp = xp * v_xt;
-        x0 += p;
-        x1 += xp;
-        x2 += xxp;
-        x3 += xxp *v_xt;
-    }
-    x0.s0 += x0.s1 + x0.s2 + x0.s3;
-    x1.s0 += x1.s1 + x1.s2 + x1.s3;
-    x2.s0 += x2.s1 + x2.s2 + x2.s3;
-    x3.s0 += x3.s1 + x3.s2 + x3.s3;
-
-    double py = lidy * x0.s0, sy = lidy*lidy;
-    int bheight = min(tileSize_height, TILE_SIZE/2);
-    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
-    {
-        m[9][lidy-bheight] = ((double)py) * sy;  // m03
-        m[8][lidy-bheight] = ((double)x1.s0) * sy;  // m12
-        m[7][lidy-bheight] = ((double)x2.s0) * lidy;  // m21
+        m[9][lidy-bheight] = ((F)py) * sy;  // m03
+        m[8][lidy-bheight] = ((F)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((F)x2.s0) * lidy;  // m21
        m[6][lidy-bheight] = x3.s0;             // m30
        m[5][lidy-bheight] = x0.s0 * sy;        // m02
        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
@ -881,9 +675,9 @@ __kernel void CvMoments_D6(__global double* src_data,  int src_rows, int src_col

    else if(lidy < bheight)
    {
-        lm[9] = ((double)py) * sy;  // m03
-        lm[8] = ((double)x1.s0) * sy;  // m12
-        lm[7] = ((double)x2.s0) * lidy;  // m21
+        lm[9] = ((F)py) * sy;  // m03
+        lm[8] = ((F)x1.s0) * sy;  // m12
+        lm[7] = ((F)x2.s0) * lidy;  // m21
        lm[6] = x3.s0;             // m30
        lm[5] = x0.s0 * sy;        // m02
        lm[4] = x1.s0 * lidy;         // m11
@ -907,47 +701,202 @@ __kernel void CvMoments_D6(__global double* src_data,  int src_rows, int src_col
    if(lidy == 0&&lidx == 0)
    {
        for( int mt = 0; mt < 10; mt++ )
-            mom[mt] = (double)lm[mt];
+            mom[mt] = (F)lm[mt];
        if(binary)
        {
-            double s = 1./255;
+            F s = 1./255;
            for( int mt = 0; mt < 10; mt++ )
                mom[mt] *= s;
        }

-        double xm = x * mom[0], ym = y * mom[0];
+        F xm = x * mom[0], ym = y * mom[0];

        // accumulate moments computed in each tile
+        dst_step /= sizeof(F);

        // + m00 ( = m00' )
-        dst_m00[wgidy*dst_cols+wgidx] = mom[0];
+        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];

        // + m10 ( = m10' + x*m00' )
-        dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
+        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;

        // + m01 ( = m01' + y*m00' )
-        dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
+        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;

        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
-        dst_m20[wgidy*dst_cols+wgidx]  = mom[3] + x * (mom[1] * 2 + xm);
+        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);

        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
-        dst_m11[wgidy*dst_cols+wgidx]  = mom[4] + x * (mom[2] + ym) + y * mom[1];
+        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];

        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
-        dst_m02[wgidy*dst_cols+wgidx]  = mom[5] + y * (mom[2] * 2 + ym);
+        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);

        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
-        dst_m30[wgidy*dst_cols+wgidx]  = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));

        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
-        dst_m21[wgidy*dst_cols+wgidx]  = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];

        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
-        dst_m12[wgidy*dst_cols+wgidx]  = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];

        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
-        dst_m03[wgidy*dst_cols+wgidx]  = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
    }
 }
-//#endif
+
+__kernel void CvMoments_D6(__global F* src_data,  int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+                           __global F* dst_m,
+                           int dst_cols, int dst_step, int blocky,
+                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+{
+    F tmp_coi[4]; // get the coi data
+    F4 tmp[64];
+    int VLEN_D = 4; // length of vetor
+    int gidy = get_global_id(0);
+    int gidx = get_global_id(1);
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // real Y index of pixel
+    int x = wgidx*TILE_SIZE;  // real X index of pixel
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step/8, TILE_SIZE);
+    tileSize_height = min(TILE_SIZE,  src_rows - y);
+    tileSize_width = min(TILE_SIZE, src_cols - x);
+
+    if(tileSize_width < TILE_SIZE)
+        for(int i = tileSize_width; i < rstep; i++ )
+            *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
+    if( coi > 0 )
+        for(int i=0; i < tileSize_width; i+=VLEN_D)
+        {
+            for(int j=0; j<4; j++)
+                tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
+            tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
+        }
+    else
+        for(int i=0; i < tileSize_width; i+=VLEN_D)
+            tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
+    F4 zero = (F4)(0);
+    F4 full = (F4)(255);
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=VLEN_D)
+            tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
+    F mom[10];
+    __local F m[10][128];
+    if(lidy == 0)
+        for(int i=0; i<10; i++)
+            for(int j=0; j<128; j++)
+                m[i][j]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    F lm[10] = {0};
+    F4 x0 = (F4)(0);
+    F4 x1 = (F4)(0);
+    F4 x2 = (F4)(0);
+    F4 x3 = (F4)(0);
+    for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
+    {
+        F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
+        F4 p = tmp[xt/VLEN_D];
+        F4 xp = v_xt * p, xxp = xp * v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += xxp *v_xt;
+    }
+    x0.s0 += x0.s1 + x0.s2 + x0.s3;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3;
+
+    F py = lidy * x0.s0, sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((F)py) * sy;  // m03
+        m[8][lidy-bheight] = ((F)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((F)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+
+    else if(lidy < bheight)
+    {
+        lm[9] = ((F)py) * sy;  // m03
+        lm[8] = ((F)x1.s0) * sy;  // m12
+        lm[7] = ((F)x2.s0) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lidy == 0&&lidx == 0)
+    {
+        for( int mt = 0; mt < 10; mt++ )
+            mom[mt] = (F)lm[mt];
+        if(binary)
+        {
+            F s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+
+        F xm = x * mom[0], ym = y * mom[0];
+
+        // accumulate moments computed in each tile
+        dst_step /= sizeof(F);
+
+        // + m00 ( = m00' )
+        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
--- a/modules/ocl/src/opencl/pyr_up.cl
+++ b/modules/ocl/src/opencl/pyr_up.cl
@ -16,6 +16,8 @@
 //
 // @Authors
 //    Zhang Chunpeng	chunpeng@multicorewareinc.com
+//    Dachuan Zhao, dachuan@multicorewareinc.com
+//    Yao Wang, yao@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@ -53,20 +55,22 @@ uchar get_valid_uchar(uchar data)
 //////////////////////////  CV_8UC1  //////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
 __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
-                int srcRows,int dstRows,int srcCols,int dstCols,
-                int srcOffset,int dstOffset,int srcStep,int dstStep)
+                          int srcRows,int dstRows,int srcCols,int dstCols,
+                          int srcOffset,int dstOffset,int srcStep,int dstStep)
 {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
-
    __local float s_srcPatch[10][10];
    __local float s_dstPatch[20][16];
+    const int tidx = get_local_id(0);
+    const int tidy = get_local_id(1);
+    const int lsizex = get_local_size(0);
+    const int lsizey = get_local_size(1);

-
-    if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+    if( tidx < 10 && tidy < 10 )
    {
-        int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
-        int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+        int srcx = mad24((int)get_group_id(0), (lsizex>>1), tidx) - 1;
+        int srcy = mad24((int)get_group_id(1), (lsizey>>1), tidy) - 1;

        srcx = abs(srcx);
        srcx = min(srcCols - 1,srcx);
@ -74,25 +78,24 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
        srcy = abs(srcy);
        srcy = min(srcRows -1 ,srcy);

-        s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
+        s_srcPatch[tidy][tidx] = (float)(src[srcx + srcy * srcStep]);

    }

    barrier(CLK_LOCAL_MEM_FENCE);

    float sum = 0;
-    const int evenFlag = (int)((get_local_id(0) & 1) == 0);
-    const int oddFlag = (int)((get_local_id(0) & 1) != 0);
-    const bool  eveny = ((get_local_id(1) & 1) == 0);
-    const int tidx = get_local_id(0);
+    const int evenFlag = (int)((tidx & 1) == 0);
+    const int oddFlag = (int)((tidx & 1) != 0);
+    const bool  eveny = ((tidy & 1) == 0);

    if(eveny)
    {
-        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
-        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
-        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)];
-        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
-        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];
+        sum = (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
+        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx    ) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
+        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];
    }

    s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
@ -103,42 +106,40 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,

        if (eveny)
        {
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
-         }
+            sum = (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 16][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 16][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 16][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx + 2) >> 1)];
+        }

-         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
-     }
+        s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+    }

-     if (get_local_id(1) > 13)
-     {
+    if (get_local_id(1) > 13)
+    {
        sum = 0;

        if (eveny)
        {
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
+            sum = (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 7][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 7][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 7][1 + ((tidx + 1) >> 1)];
            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
        }
-        s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
-     }
+        s_dstPatch[4 + tidy][tidx] = sum;
+    }

    barrier(CLK_LOCAL_MEM_FENCE);

    sum = 0;

-    const int tidy = get_local_id(1);
-
-    sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
-    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][get_local_id(0)];
-    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][get_local_id(0)];
-    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][get_local_id(0)];
-    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
+    sum = 0.0625f * s_dstPatch[2 + tidy - 2][tidx];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][tidx];
+    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][tidx];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][tidx];
+    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx];

    if ((x < dstCols) && (y < dstRows))
        dst[x + y * dstStep] = (float)(4.0f * sum);
@ -149,8 +150,8 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
 //////////////////////////  CV_16UC1  /////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
 __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
-                int srcRows,int dstRows,int srcCols,int dstCols,
-                int srcOffset,int dstOffset,int srcStep,int dstStep)
+                          int srcRows,int dstRows,int srcCols,int dstCols,
+                          int srcOffset,int dstOffset,int srcStep,int dstStep)
 {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
@ -210,13 +211,13 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
-         }
+        }

-         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
-     }
+        s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+    }

-     if (get_local_id(1) > 13)
-     {
+    if (get_local_id(1) > 13)
+    {
        sum = 0;

        if (eveny)
@ -228,7 +229,7 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
        }
        s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
-     }
+    }

    barrier(CLK_LOCAL_MEM_FENCE);

@ -251,12 +252,15 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
 //////////////////////////  CV_32FC1  /////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
 __kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
-                int srcRows,int dstRows,int srcCols,int dstCols,
-                int srcOffset,int dstOffset,int srcStep,int dstStep)
+                          int srcRows,int dstRows,int srcCols,int dstCols,
+                          int srcOffset,int dstOffset,int srcStep,int dstStep)
 {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
-
+    const int tidx = get_local_id(0);
+    const int tidy = get_local_id(1);
+    const int lsizex = get_local_size(0);
+    const int lsizey = get_local_size(1);
    __local float s_srcPatch[10][10];
    __local float s_dstPatch[20][16];

@ -266,10 +270,10 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
    dstStep = dstStep >> 2;


-    if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+    if( tidx < 10 && tidy < 10 )
    {
-        int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
-        int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+        int srcx = mad24((int)get_group_id(0), lsizex>>1, tidx) - 1;
+        int srcy = mad24((int)get_group_id(1), lsizey>>1, tidy) - 1;

        srcx = abs(srcx);
        srcx = min(srcCols - 1,srcx);
@ -277,71 +281,67 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
        srcy = abs(srcy);
        srcy = min(srcRows -1 ,srcy);

-        s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
+        s_srcPatch[tidy][tidx] = (float)(src[srcx + srcy * srcStep]);

    }

    barrier(CLK_LOCAL_MEM_FENCE);

    float sum = 0;
-    const int evenFlag = (int)((get_local_id(0) & 1) == 0);
-    const int oddFlag = (int)((get_local_id(0) & 1) != 0);
-    const bool  eveny = ((get_local_id(1) & 1) == 0);
-    const int tidx = get_local_id(0);
+    const int evenFlag = (int)((tidx & 1) == 0);
+    const int oddFlag = (int)((tidx & 1) != 0);
+    const bool  eveny = ((tidy & 1) == 0);
+

    if(eveny)
    {
-        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
-        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
-        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)];
-        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
-        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];
+        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
+        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx    ) >> 1)];
+        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
+        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];
    }

-    s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+    s_dstPatch[2 + tidy][tidx] = sum;

-    if (get_local_id(1) < 2)
+    if (tidy < 2)
    {
        sum = 0;

        if (eveny)
        {
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
-         }
+            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 16][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 16][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 16][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx + 2) >> 1)];
+        }

-         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
-     }
+        s_dstPatch[tidy][tidx] = sum;
+    }

-     if (get_local_id(1) > 13)
-     {
+    if (tidy > 13)
+    {
        sum = 0;

        if (eveny)
        {
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
+            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 7][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 7][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 7][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx + 2) >> 1)];
        }
-        s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
-     }
+        s_dstPatch[4 + tidy][tidx] = sum;
+    }

    barrier(CLK_LOCAL_MEM_FENCE);

-    sum = 0;
-
-    const int tidy = get_local_id(1);
-
-    sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
-    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][get_local_id(0)];
-    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][get_local_id(0)];
-    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][get_local_id(0)];
-    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
+    sum = 0.0625f * s_dstPatch[2 + tidy - 2][tidx];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][tidx];
+    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][tidx];
+    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][tidx];
+    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx];

    if ((x < dstCols) && (y < dstRows))
        dst[x + y * dstStep] = (float)(4.0f * sum);
@ -376,37 +376,16 @@ uchar4 convert_float4_to_uchar4(float4 data)
    return u4Data;
 }

-float4 int_x_float4(int leftOpr,float4 rightOpr)
-{
-    float4 result = {0,0,0,0};
-
-    result.x = rightOpr.x * leftOpr;
-    result.y = rightOpr.y * leftOpr;
-    result.z = rightOpr.z * leftOpr;
-    result.w = rightOpr.w * leftOpr;
-
-    return result;
-}
-
-float4 float4_x_float4(float4 leftOpr,float4 rightOpr)
-{
-    float4 result;
-
-    result.x = leftOpr.x * rightOpr.x;
-    result.y = leftOpr.y * rightOpr.y;
-    result.z = leftOpr.z * rightOpr.z;
-    result.w = leftOpr.w * rightOpr.w;
-
-    return result;
-}
-
 __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
-                int srcRows,int dstRows,int srcCols,int dstCols,
-                int srcOffset,int dstOffset,int srcStep,int dstStep)
+                          int srcRows,int dstRows,int srcCols,int dstCols,
+                          int srcOffset,int dstOffset,int srcStep,int dstStep)
 {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
-
+    const int tidx = get_local_id(0);
+    const int tidy = get_local_id(1);
+    const int lsizex = get_local_size(0);
+    const int lsizey = get_local_size(1);
    __local float4 s_srcPatch[10][10];
    __local float4 s_dstPatch[20][16];

@ -416,10 +395,10 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
    dstStep >>= 2;


-    if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+    if( tidx < 10 && tidy < 10 )
    {
-        int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
-        int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+        int srcx = mad24((int)get_group_id(0), lsizex>>1, tidx) - 1;
+        int srcy = mad24((int)get_group_id(1), lsizey>>1, tidy) - 1;

        srcx = abs(srcx);
        srcx = min(srcCols - 1,srcx);
@ -427,17 +406,16 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
        srcy = abs(srcy);
        srcy = min(srcRows -1 ,srcy);

-        s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]);
+        s_srcPatch[tidy][tidx] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]);
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    float4 sum = (float4)(0,0,0,0);

-    const int evenFlag = (int)((get_local_id(0) & 1) == 0);
-    const int oddFlag = (int)((get_local_id(0) & 1) != 0);
-    const bool  eveny = ((get_local_id(1) & 1) == 0);
-    const int tidx = get_local_id(0);
+    const int evenFlag = (int)((tidx & 1) == 0);
+    const int oddFlag = (int)((tidx & 1) != 0);
+    const bool  eveny = ((tidy & 1) == 0);

    float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
    float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
@ -446,63 +424,59 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,

    if(eveny)
    {
-        sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]);
-        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]);
-        sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)]);
-        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]);
-        sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]);
+        sum = sum + ( evenFlag * co3) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
+        sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
+        sum = sum + ( evenFlag * co1) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx    ) >> 1)];
+        sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
+        sum = sum + ( evenFlag * co3) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];

    }

-    s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+    s_dstPatch[2 + tidy][tidx] = sum;

-    if (get_local_id(1) < 2)
+    if (tidy < 2)
    {
        sum = 0;

        if (eveny)
        {
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx    ) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]);
-         }
+            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-16][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)];
+        }

-         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
-     }
+        s_dstPatch[tidy][tidx] = sum;
+    }

-     if (get_local_id(1) > 13)
-     {
+    if (tidy > 13)
+    {
        sum = 0;

        if (eveny)
        {
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx    ) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]);
+            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)];

        }
-        s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
-     }
+        s_dstPatch[4 + tidy][tidx] = sum;
+    }

    barrier(CLK_LOCAL_MEM_FENCE);

-    sum = 0;
-
-    const int tidy = get_local_id(1);
-
-    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]);
-    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]);
-    sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy    ][get_local_id(0)]);
-    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]);
-    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]);
+    sum = co3 * s_dstPatch[2 + tidy - 2][tidx];
+    sum = sum + co2 * s_dstPatch[2 + tidy - 1][tidx];
+    sum = sum + co1 * s_dstPatch[2 + tidy    ][tidx];
+    sum = sum + co2 * s_dstPatch[2 + tidy + 1][tidx];
+    sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx];

    if ((x < dstCols) && (y < dstRows))
    {
-        dst[x + y * dstStep] = convert_float4_to_uchar4(int_x_float4(4.0f,sum));
+        dst[x + y * dstStep] = convert_float4_to_uchar4(4.0f * sum);
    }
 }
 ///////////////////////////////////////////////////////////////////////
@ -535,8 +509,8 @@ ushort4 convert_float4_to_ushort4(float4 data)


 __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
-                int srcRows,int dstRows,int srcCols,int dstCols,
-                int srcOffset,int dstOffset,int srcStep,int dstStep)
+                          int srcRows,int dstRows,int srcCols,int dstCols,
+                          int srcOffset,int dstOffset,int srcStep,int dstStep)
 {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
@ -580,11 +554,11 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,

    if(eveny)
    {
-        sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]);
-        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]);
-        sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)]);
-        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]);
-        sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]);
+        sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
+        sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
+        sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)];
+        sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
+        sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];

    }

@ -596,31 +570,31 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,

        if (eveny)
        {
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx    ) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]);
-         }
+            sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * co2  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * co1 ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * co2  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
+        }

-         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
-     }
+        s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
+    }

-     if (get_local_id(1) > 13)
-     {
+    if (get_local_id(1) > 13)
+    {
        sum = 0;

        if (eveny)
        {
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx    ) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]);
+            sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * co1) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];

        }
        s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
-     }
+    }

    barrier(CLK_LOCAL_MEM_FENCE);

@ -628,15 +602,15 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,

    const int tidy = get_local_id(1);

-    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]);
-    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]);
-    sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy    ][get_local_id(0)]);
-    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]);
-    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]);
+    sum = sum + co3 * s_dstPatch[2 + tidy - 2][get_local_id(0)];
+    sum = sum + co2 * s_dstPatch[2 + tidy - 1][get_local_id(0)];
+    sum = sum + co1 * s_dstPatch[2 + tidy    ][get_local_id(0)];
+    sum = sum + co2 * s_dstPatch[2 + tidy + 1][get_local_id(0)];
+    sum = sum + co3 * s_dstPatch[2 + tidy + 2][get_local_id(0)];

    if ((x < dstCols) && (y < dstRows))
    {
-        dst[x + y * dstStep] = convert_float4_to_ushort4(int_x_float4(4.0f,sum));
+        dst[x + y * dstStep] = convert_float4_to_ushort4(4.0f * sum);
    }
 }

@ -644,12 +618,15 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
 //////////////////////////  CV_32FC4 //////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
 __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
-                int srcRows,int dstRows,int srcCols,int dstCols,
-                int srcOffset,int dstOffset,int srcStep,int dstStep)
+                          int srcRows,int dstRows,int srcCols,int dstCols,
+                          int srcOffset,int dstOffset,int srcStep,int dstStep)
 {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
-
+    const int tidx = get_local_id(0);
+    const int tidy = get_local_id(1);
+    const int lsizex = get_local_size(0);
+    const int lsizey = get_local_size(1);
    __local float4 s_srcPatch[10][10];
    __local float4 s_dstPatch[20][16];

@ -659,10 +636,10 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
    dstStep >>= 4;


-    if( get_local_id(0) < 10 && get_local_id(1) < 10 )
+    if( tidx < 10 && tidy < 10 )
    {
-        int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
-        int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
+        int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + tidx) - 1;
+        int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + tidy) - 1;

        srcx = abs(srcx);
        srcx = min(srcCols - 1,srcx);
@ -670,17 +647,16 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
        srcy = abs(srcy);
        srcy = min(srcRows -1 ,srcy);

-        s_srcPatch[get_local_id(1)][get_local_id(0)] = (float4)(src[srcx + srcy * srcStep]);
+        s_srcPatch[tidy][tidx] = (float4)(src[srcx + srcy * srcStep]);
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    float4 sum = (float4)(0,0,0,0);

-    const int evenFlag = (int)((get_local_id(0) & 1) == 0);
-    const int oddFlag = (int)((get_local_id(0) & 1) != 0);
-    const bool  eveny = ((get_local_id(1) & 1) == 0);
-    const int tidx = get_local_id(0);
+    const int evenFlag = (int)((tidx & 1) == 0);
+    const int oddFlag = (int)((tidx & 1) != 0);
+    const bool  eveny = ((tidy & 1) == 0);

    float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
    float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
@ -689,59 +665,55 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,

    if(eveny)
    {
-        sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]);
-        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]);
-        sum = sum + float4_x_float4(int_x_float4( evenFlag, co1 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)]);
-        sum = sum + float4_x_float4(int_x_float4( oddFlag , co2 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]);
-        sum = sum + float4_x_float4(int_x_float4( evenFlag, co3 ) , s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]);
+        sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
+        sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
+        sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx    ) >> 1)];
+        sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
+        sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];

    }

-    s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
+    s_dstPatch[2 + tidy][tidx] = sum;

-    if (get_local_id(1) < 2)
+    if (tidy < 2)
    {
        sum = 0;

        if (eveny)
        {
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx - 2) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx - 1) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co1 ) , s_srcPatch[0][1 + ((tidx    ) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2  ) , s_srcPatch[0][1 + ((tidx + 1) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[0][1 + ((tidx + 2) >> 1)]);
-         }
+            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * co2  ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * co1 ) * s_srcPatch[lsizey-16][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * co2  ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)];
+        }

-         s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
-     }
+        s_dstPatch[tidy][tidx] = sum;
+    }

-     if (get_local_id(1) > 13)
-     {
+    if (tidy > 13)
+    {
        sum = 0;

        if (eveny)
        {
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx - 2) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx - 1) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co1) , s_srcPatch[9][1 + ((tidx    ) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4( oddFlag , co2) , s_srcPatch[9][1 + ((tidx + 1) >> 1)]);
-            sum = sum + float4_x_float4(int_x_float4(evenFlag , co3) , s_srcPatch[9][1 + ((tidx + 2) >> 1)]);
+            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)];
+            sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx    ) >> 1)];
+            sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)];
+            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)];

        }
-        s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
-     }
+        s_dstPatch[4 + tidy][tidx] = sum;
+    }

    barrier(CLK_LOCAL_MEM_FENCE);

-    sum = 0;
-
-    const int tidy = get_local_id(1);
-
-    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy - 2][get_local_id(0)]);
-    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy - 1][get_local_id(0)]);
-    sum = sum + float4_x_float4(co1 , s_dstPatch[2 + tidy    ][get_local_id(0)]);
-    sum = sum + float4_x_float4(co2 , s_dstPatch[2 + tidy + 1][get_local_id(0)]);
-    sum = sum + float4_x_float4(co3 , s_dstPatch[2 + tidy + 2][get_local_id(0)]);
+    sum = co3 * s_dstPatch[2 + tidy - 2][tidx];
+    sum = sum + co2 * s_dstPatch[2 + tidy - 1][tidx];
+    sum = sum + co1 * s_dstPatch[2 + tidy    ][tidx];
+    sum = sum + co2 * s_dstPatch[2 + tidy + 1][tidx];
+    sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx];

    if ((x < dstCols) && (y < dstRows))
    {
--- a/modules/ocl/src/opencl/stereobm.cl
+++ b/modules/ocl/src/opencl/stereobm.cl
@ -323,7 +323,7 @@ float sobel(__global unsigned char *input, int x, int y, int rows, int cols)
    float conv = 0;
    int y1 = y==0? 0 : y-1;
    int x1 = x==0? 0 : x-1;
-    if(x < cols && y < rows)
+    if(x < cols && y < rows && x > 0 && y > 0)
    {
        conv = (float)input[(y1)  * cols + (x1)] * (-1) + (float)input[(y1)  * cols + (x+1)] * (1) +
               (float)input[(y)   * cols + (x1)] * (-2) + (float)input[(y)   * cols + (x+1)] * (2) +
--- a/modules/ocl/test/test_brute_force_matcher.cpp
+++ b/modules/ocl/test/test_brute_force_matcher.cpp
@ -110,7 +110,7 @@ namespace
        }
    };

-    TEST_P(BruteForceMatcher, DISABLED_Match_Single)
+    TEST_P(BruteForceMatcher, Match_Single)
    {
        cv::ocl::BruteForceMatcher_OCL_base matcher(distType);

@ -130,7 +130,7 @@ namespace
        ASSERT_EQ(0, badCount);
    }

-    TEST_P(BruteForceMatcher, DISABLED_KnnMatch_2_Single)
+    TEST_P(BruteForceMatcher, KnnMatch_2_Single)
    {
        const int knn = 2;

--- a/modules/ocl/test/test_match_template.cpp
+++ b/modules/ocl/test/test_match_template.cpp
@ -75,7 +75,7 @@ PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMetho
    }
 };

-TEST_P(MatchTemplate8U, DISABLED_Accuracy)
+TEST_P(MatchTemplate8U, Accuracy)
 {

    std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
@ -138,18 +138,18 @@ TEST_P(MatchTemplate32F, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1, sss);
 }

-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U,
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MatchTemplate8U,
                        testing::Combine(
                            MTEMP_SIZES,
-                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
+                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
                            testing::Values(Channels(1), Channels(3), Channels(4)),
                            ALL_TEMPLATE_METHODS
                        )
                       );

-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MatchTemplate32F, testing::Combine(
                            MTEMP_SIZES,
-                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
+                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
                            testing::Values(Channels(1), Channels(3), Channels(4)),
                            testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
 #endif