Merge branch '2.4'

2013-02-28 11:10:40 +04:00
parent c5e979ceb7 2be893a21e
commit 891d7da6ee
39 changed files with 6898 additions and 386 deletions
--- a/modules/ocl/src/fft.cpp
+++ b/modules/ocl/src/fft.cpp
@@ -53,6 +53,10 @@ void cv::ocl::dft(const oclMat&, oclMat&, Size, int)
 {
    CV_Error(CV_StsNotImplemented, "OpenCL DFT is not implemented");
 }
+namespace cv { namespace ocl {
+    void fft_teardown();
+}}
+void cv::ocl::fft_teardown(){}
 #else
 #include "clAmdFft.h"
 namespace cv
--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@@ -55,13 +55,16 @@ using namespace cv::ocl;

 //#define PRINT_KERNEL_RUN_TIME
 #define RUN_TIMES 100
-
+#ifndef CL_MEM_USE_PERSISTENT_MEM_AMD
+#define CL_MEM_USE_PERSISTENT_MEM_AMD 0
+#endif
 //#define AMD_DOUBLE_DIFFER

 namespace cv
 {
    namespace ocl
    {
+        extern void fft_teardown();
        /*
         * The binary caching system to eliminate redundant program source compilation.
         * Strictly, this is not a cache because we do not implement evictions right now.
@@ -69,6 +72,15 @@ namespace cv
         */
        std::auto_ptr<ProgramCache> ProgramCache::programCache;
        ProgramCache *programCache = NULL;
+        DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT;
+        DevMemRW gDeviceMemRW = DEVICE_MEM_R_W;
+        int gDevMemTypeValueMap[5] = {0,
+                                      CL_MEM_ALLOC_HOST_PTR,
+                                      CL_MEM_USE_HOST_PTR,
+                                      CL_MEM_COPY_HOST_PTR,
+                                      CL_MEM_USE_PERSISTENT_MEM_AMD};
+        int gDevMemRWValueMap[3] = {CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY};
+
        ProgramCache::ProgramCache()
        {
            codeCache.clear();
@@ -110,30 +122,25 @@ namespace cv
        }

        ////////////////////////Common OpenCL specific calls///////////////
-        //Info::Info()
-        //{
-        //	oclplatform = 0;
-        //	oclcontext = 0;
-        //	devnum = 0;
-        //}
-        //Info::~Info()
-        //{
-        //	release();
-        //}
-        //void Info::release()
-        //{
-        //	if(oclplatform)
-        //	{
-        //		oclplatform = 0;
-        //	}
-        //	if(oclcontext)
-        //	{
-        //		openCLSafeCall(clReleaseContext(oclcontext));
-        //	}
-        //	devices.empty();
-        //	devName.empty();
-        //}
-        struct Info::Impl
+        int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type)
+        {
+            rw_type = gDeviceMemRW;
+            mem_type = gDeviceMemType;
+            return Context::getContext()->impl->unified_memory;
+        }
+
+        int setDevMemType(DevMemRW rw_type, DevMemType mem_type)
+        {
+            if( (mem_type == DEVICE_MEM_PM && Context::getContext()->impl->unified_memory == 0) ||
+                 mem_type == DEVICE_MEM_UHP ||
+                 mem_type == DEVICE_MEM_CHP )
+                return -1;
+            gDeviceMemRW = rw_type;
+            gDeviceMemType = mem_type;
+            return 0;
+        }
+
+       struct Info::Impl
        {
            cl_platform_id oclplatform;
            std::vector<cl_device_id> devices;
@@ -287,11 +294,8 @@ namespace cv
         }

        void *getoclContext()
-
        {
-
            return &(Context::getContext()->impl->clContext);
-
        }

        void *getoclCommandQueue()
@@ -316,10 +320,16 @@ namespace cv

        void openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
                               size_t widthInBytes, size_t height)
+        {
+            openCLMallocPitchEx(clCxt, dev_ptr, pitch, widthInBytes, height, gDeviceMemRW, gDeviceMemType);
+        }
+
+        void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
+                               size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
        {
            cl_int status;

-            *dev_ptr = clCreateBuffer(clCxt->impl->clContext, CL_MEM_READ_WRITE,
+            *dev_ptr = clCreateBuffer(clCxt->impl->clContext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
                                      widthInBytes * height, 0, &status);
            openCLVerifyCall(status);
            *pitch = widthInBytes;
@@ -834,6 +844,11 @@ namespace cv
            clcxt->impl->double_support = oclinfo.impl->double_support;
            //extra options to recognize compiler options
            memcpy(clcxt->impl->extra_options, oclinfo.impl->extra_options, 512);
+            cl_bool unfymem = false;
+            openCLSafeCall(clGetDeviceInfo(clcxt->impl->devices, CL_DEVICE_HOST_UNIFIED_MEMORY,
+                                           sizeof(cl_bool), (void *)&unfymem, NULL));
+            if(unfymem)
+                clcxt->impl->unified_memory = 1;
        }
        Context::Context()
        {
@@ -850,6 +865,7 @@ namespace cv
            impl->double_support = 0;
            //extra options to recognize vendor specific fp64 extensions
            memset(impl->extra_options, 0, 512);
+            impl->unified_memory = 0;
            programCache = ProgramCache::getProgramCache();
        }

@@ -874,6 +890,7 @@ namespace cv
        }
        void Info::release()
        {
+            fft_teardown();
            if(impl->oclplatform)
            {
                impl->oclplatform = 0;
--- a/modules/ocl/src/interpolate_frames.cpp
+++ b/modules/ocl/src/interpolate_frames.cpp
@@ -45,6 +45,7 @@

 #include <iomanip>
 #include "precomp.hpp"
+#include "mcwutil.hpp"

 using namespace cv;
 using namespace cv::ocl;
@@ -230,73 +231,10 @@ void interpolate::blendFrames(const oclMat &frame0, const oclMat &/*frame1*/, co

 void interpolate::bindImgTex(const oclMat &img, cl_mem &texture)
 {
-    cl_image_format format;
-    int err;
-    int depth    = img.depth();
-    int channels = img.channels();
-
-    switch(depth)
-    {
-    case CV_8U:
-        format.image_channel_data_type = CL_UNSIGNED_INT8;
-        break;
-    case CV_32S:
-        format.image_channel_data_type = CL_UNSIGNED_INT32;
-        break;
-    case CV_32F:
-        format.image_channel_data_type = CL_FLOAT;
-        break;
-    default:
-        throw std::exception();
-        break;
-    }
-    switch(channels)
-    {
-    case 1:
-        format.image_channel_order     = CL_R;
-        break;
-    case 3:
-        format.image_channel_order     = CL_RGB;
-        break;
-    case 4:
-        format.image_channel_order     = CL_RGBA;
-        break;
-    default:
-        throw std::exception();
-        break;
-    }
    if(texture)
    {
        openCLFree(texture);
    }
-
-#ifdef CL_VERSION_1_2
-    cl_image_desc desc;
-    desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
-    desc.image_width      = img.step / img.elemSize();
-    desc.image_height     = img.rows;
-    desc.image_depth      = 0;
-    desc.image_array_size = 1;
-    desc.image_row_pitch  = 0;
-    desc.image_slice_pitch = 0;
-    desc.buffer           = NULL;
-    desc.num_mip_levels   = 0;
-    desc.num_samples      = 0;
-    texture = clCreateImage(Context::getContext()->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
-#else
-    texture = clCreateImage2D(
-                  Context::getContext()->impl->clContext,
-                  CL_MEM_READ_WRITE,
-                  &format,
-                  img.step / img.elemSize(),
-                  img.rows,
-                  0,
-                  NULL,
-                  &err);
-#endif
-    size_t origin[] = { 0, 0, 0 };
-    size_t region[] = { img.step / img.elemSize(), img.rows, 1 };
-    clEnqueueCopyBufferToImage(img.clCxt->impl->clCmdQueue, (cl_mem)img.data, texture, 0, origin, region, 0, NULL, 0);
-    openCLSafeCall(err);
+    texture = bindTexture(img);
 }

--- a/modules/ocl/src/kernels/cvt_color.cl
+++ b/modules/ocl/src/kernels/cvt_color.cl
@@ -203,8 +203,8 @@ __kernel void YUV2RGB(int cols,int rows,int src_step,int dst_step,int channels,

 __constant int ITUR_BT_601_CY = 1220542;
 __constant int ITUR_BT_601_CUB = 2116026;
-__constant int ITUR_BT_601_CUG = -409993;
-__constant int ITUR_BT_601_CVG = -852492;
+__constant int ITUR_BT_601_CUG = 409993;
+__constant int ITUR_BT_601_CVG = 852492;
 __constant int ITUR_BT_601_CVR = 1673527;
 __constant int ITUR_BT_601_SHIFT = 20;

@@ -229,7 +229,7 @@ __kernel void YUV2RGBA_NV12(int cols,int rows,int src_step,int dst_step,
        int V  = usrc[1] - 128;

        int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * V;
-        int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * V + ITUR_BT_601_CUG * U;
+        int guv = (1 << (ITUR_BT_601_SHIFT - 1)) - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U;
        int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * U;

        Y1 = max(0, Y1 - 16) * ITUR_BT_601_CY;
--- a/modules/ocl/src/kernels/moments.cl
+++ b/modules/ocl/src/kernels/moments.cl
@@ -0,0 +1,938 @@
+#if defined (DOUBLE_SUPPORT)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#else
+typedef float double;
+typedef float4 double4;
+#define convert_double4 convert_float4
+#endif
+//#pragma OPENCL EXTENSION cl_amd_printf:enable
+//#if defined (DOUBLE_SUPPORT)
+__kernel void icvContourMoments(int contour_total,
+                                __global float* reader_oclmat_data,
+                                __global double* dst_a00,
+                                __global double* dst_a10,
+                                __global double* dst_a01,
+                                __global double* dst_a20,
+                                __global double* dst_a11,
+                                __global double* dst_a02,
+                                __global double* dst_a30,
+                                __global double* dst_a21,
+                                __global double* dst_a12,
+                                __global double* dst_a03)
+{
+    double xi_1, yi_1, xi_12, yi_12, xi, yi, xi2, yi2, dxy, xii_1, yii_1;
+    int idx = get_global_id(0);
+
+    xi_1 = *(reader_oclmat_data + (get_global_id(0) << 1));
+    yi_1 = *(reader_oclmat_data + (get_global_id(0) << 1) + 1);
+    xi_12 = xi_1 * xi_1;
+    yi_12 = yi_1 * yi_1;
+
+    if(idx == contour_total - 1)
+    {
+        xi = *(reader_oclmat_data);
+        yi = *(reader_oclmat_data + 1);
+    }
+    else
+    {
+        xi = *(reader_oclmat_data + (idx + 1) * 2);
+        yi = *(reader_oclmat_data + (idx + 1) * 2 + 1);
+    }
+
+    xi2 = xi * xi;
+    yi2 = yi * yi;
+    dxy = xi_1 * yi - xi * yi_1;
+    xii_1 = xi_1 + xi;
+    yii_1 = yi_1 + yi;
+
+    dst_a00[idx] = dxy;
+    dst_a10[idx] = dxy * xii_1;
+    dst_a01[idx] = dxy * yii_1;
+    dst_a20[idx] = dxy * (xi_1 * xii_1 + xi2);
+    dst_a11[idx] = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi));
+    dst_a02[idx] = dxy * (yi_1 * yii_1 + yi2);
+    dst_a30[idx] = dxy * xii_1 * (xi_12 + xi2);
+    dst_a03[idx] = dxy * yii_1 * (yi_12 + yi2);
+    dst_a21[idx] =
+        dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 +
+               xi2 * (yi_1 + 3 * yi));
+    dst_a12[idx] =
+        dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 +
+               yi2 * (xi_1 + 3 * xi));
+}
+//#endif
+
+//#if defined (DOUBLE_SUPPORT)
+__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+                           __global double* dst_m00,
+                           __global double* dst_m10,
+                           __global double* dst_m01,
+                           __global double* dst_m20,
+                           __global double* dst_m11,
+                           __global double* dst_m02,
+                           __global double* dst_m30,
+                           __global double* dst_m21,
+                           __global double* dst_m12,
+                           __global double* dst_m03,
+                           int dst_cols, int dst_step, int type, int depth, int cn, int coi, int binary, int TILE_SIZE)
+{
+    uchar tmp_coi[16]; // get the coi data
+    uchar16 tmp[16];
+    int VLEN_C = 16;  // vector length of uchar
+
+    int gidy = get_global_id(0);
+    int gidx = get_global_id(1);
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE; // vector length of uchar
+    int x = wgidx*TILE_SIZE;  // vector length of uchar
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step, TILE_SIZE);
+    tileSize_height = min(TILE_SIZE, src_rows - y);
+    tileSize_width = min(TILE_SIZE, src_cols - x);
+
+    if( tileSize_width < TILE_SIZE )
+        for(int i = tileSize_width; i < rstep; i++ )
+            *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
+    if( coi > 0 )   //channel of interest
+        for(int i = 0; i < tileSize_width; i += VLEN_C)
+        {
+            for(int j=0; j<VLEN_C; j++)
+                tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
+            tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
+                                      tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
+        }
+    else
+        for(int i=0; i < tileSize_width; i+=VLEN_C)
+            tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
+    uchar16 zero = (uchar16)(0);
+    uchar16 full = (uchar16)(255);
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=VLEN_C)
+            tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
+    double mom[10];
+    __local int m[10][128];
+    if(lidy == 0)
+        for(int i=0; i<10; i++)
+            for(int j=0; j<128; j++)
+                m[i][j]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int lm[10] = {0};
+    int16 x0 = (int16)(0);
+    int16 x1 = (int16)(0);
+    int16 x2 = (int16)(0);
+    int16 x3 = (int16)(0);
+    for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_C) )
+    {
+        int16 v_xt = (int16)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7, xt+8, xt+9, xt+10, xt+11, xt+12, xt+13, xt+14, xt+15);
+        int16 p = convert_int16(tmp[xt/VLEN_C]);
+        int16 xp = v_xt * p, xxp = xp *v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += xxp * v_xt;
+    }
+    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7 + x0.s8 + x0.s9 + x0.sa + x0.sb + x0.sc + x0.sd + x0.se + x0.sf;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7 + x1.s8 + x1.s9 + x1.sa + x1.sb + x1.sc + x1.sd + x1.se + x1.sf;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7 + x2.s8 + x2.s9 + x2.sa + x2.sb + x2.sc + x2.sd + x2.se + x2.sf;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7 + x3.s8 + x3.s9 + x3.sa + x3.sb + x3.sc + x3.sd + x3.se + x3.sf;
+    int py = lidy * ((int)x0.s0);
+    int sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((int)py) * sy;  // m03
+        m[8][lidy-bheight] = ((int)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((int)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+    else if(lidy < bheight)
+    {
+        lm[9] = ((int)py) * sy;  // m03
+        lm[8] = ((int)x1.s0) * sy;  // m12
+        lm[7] = ((int)x2.s0) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for( int j = bheight; j >= 1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lidy == 0&&lidx == 0)
+    {
+        for( int mt = 0; mt < 10; mt++ )
+            mom[mt] = (double)lm[mt];
+        if(binary)
+        {
+            double s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+        double xm = x * mom[0], ym = y * mom[0];
+
+        // accumulate moments computed in each tile
+
+        // + m00 ( = m00' )
+        dst_m00[wgidy*dst_cols+wgidx] = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        dst_m10[wgidy*dst_cols+wgidx]  = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        dst_m01[wgidy*dst_cols+wgidx]  = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        dst_m20[wgidy*dst_cols+wgidx]  = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        dst_m11[wgidy*dst_cols+wgidx]  = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        dst_m02[wgidy*dst_cols+wgidx]  = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        dst_m30[wgidy*dst_cols+wgidx]  = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        dst_m21[wgidy*dst_cols+wgidx]  = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        dst_m12[wgidy*dst_cols+wgidx]  = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
+//#endif
+//#if defined (DOUBLE_SUPPORT)
+__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, __global double* sum, __global double* dst_m00,
+                      __global double* dst_m10,
+                      __global double* dst_m01,
+                      __global double* dst_m20,
+                      __global double* dst_m11,
+                      __global double* dst_m02,
+                      __global double* dst_m30,
+                      __global double* dst_m21,
+                      __global double* dst_m12,
+                      __global double* dst_m03)
+{
+    int gidy = get_global_id(0);
+    int gidx = get_global_id(1);
+    int block_y = src_rows/tile_height;
+    int block_x = src_cols/tile_width;
+    int block_num;
+
+    if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0)
+        block_y ++;
+    if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0)
+        block_x ++;
+    block_num = block_y * block_x;
+    __local double dst_sum[10][128];
+    if(gidy<128-block_num)
+        for(int i=0; i<10; i++)
+            dst_sum[i][gidy+block_num]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(gidy<block_num)
+    {
+        dst_sum[0][gidy] = dst_m00[gidy];
+        dst_sum[1][gidy] = dst_m10[gidy];
+        dst_sum[2][gidy] = dst_m01[gidy];
+        dst_sum[3][gidy] = dst_m20[gidy];
+        dst_sum[4][gidy] = dst_m11[gidy];
+        dst_sum[5][gidy] = dst_m02[gidy];
+        dst_sum[6][gidy] = dst_m30[gidy];
+        dst_sum[7][gidy] = dst_m21[gidy];
+        dst_sum[8][gidy] = dst_m12[gidy];
+        dst_sum[9][gidy] = dst_m03[gidy];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for(int lsize=64; lsize>0; lsize>>=1)
+    {
+        if(gidy<lsize)
+        {
+            int lsize2 = gidy + lsize;
+            for(int i=0; i<10; i++)
+                dst_sum[i][gidy] += dst_sum[i][lsize2];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(gidy==0)
+        for(int i=0; i<10; i++)
+            sum[i] = dst_sum[i][0];
+}
+//#endif
+//#if defined (DOUBLE_SUPPORT)
+__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+                           __global double* dst_m00,
+                           __global double* dst_m10,
+                           __global double* dst_m01,
+                           __global double* dst_m20,
+                           __global double* dst_m11,
+                           __global double* dst_m02,
+                           __global double* dst_m30,
+                           __global double* dst_m21,
+                           __global double* dst_m12,
+                           __global double* dst_m03,
+                           int dst_cols, int dst_step,
+                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+{
+    ushort tmp_coi[8]; // get the coi data
+    ushort8 tmp[32];
+    int VLEN_US = 8; // vector length of ushort
+    int gidy = get_global_id(0);
+    int gidx = get_global_id(1);
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // real Y index of pixel
+    int x = wgidx*TILE_SIZE;  // real X index of pixel
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step/2, TILE_SIZE);
+    tileSize_height = min(TILE_SIZE, src_rows - y);
+    tileSize_width = min(TILE_SIZE, src_cols -x);
+    if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
+        for(int i=tileSize_width; i < rstep; i++ )
+            *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
+    if( coi > 0 )
+        for(int i=0; i < tileSize_width; i+=VLEN_US)
+        {
+            for(int j=0; j<VLEN_US; j++)
+                tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
+            tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
+        }
+    else
+        for(int i=0; i < tileSize_width; i+=VLEN_US)
+            tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
+    ushort8 zero = (ushort8)(0);
+    ushort8 full = (ushort8)(255);
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=VLEN_US)
+            tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
+    double mom[10];
+    __local long m[10][128];
+    if(lidy == 0)
+        for(int i=0; i<10; i++)
+            for(int j=0; j<128; j++)
+                m[i][j]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    long lm[10] = {0};
+    int8 x0 = (int8)(0);
+    int8 x1 = (int8)(0);
+    int8 x2 = (int8)(0);
+    long8 x3 = (long8)(0);
+    for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_US) )
+    {
+        int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
+        int8 p = convert_int8(tmp[xt/VLEN_US]);
+        int8 xp = v_xt * p, xxp = xp * v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += convert_long8(xxp) *convert_long8(v_xt);
+    }
+    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
+
+    int py = lidy * x0.s0, sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((long)py) * sy;  // m03
+        m[8][lidy-bheight] = ((long)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((long)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+    else if(lidy < bheight)
+    {
+        lm[9] = ((long)py) * sy;  // m03
+        lm[8] = ((long)x1.s0) * sy;  // m12
+        lm[7] = ((long)x2.s0) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lidy == 0&&lidx == 0)
+    {
+        for(int mt = 0; mt < 10; mt++ )
+            mom[mt] = (double)lm[mt];
+
+        if(binary)
+        {
+            double s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+
+        double xm = x  *mom[0], ym = y * mom[0];
+
+        // accumulate moments computed in each tile
+
+        // + m00 ( = m00' )
+        dst_m00[wgidy*dst_cols+wgidx] = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        dst_m10[wgidy*dst_cols+wgidx]  = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        dst_m01[wgidy*dst_cols+wgidx]  = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        dst_m20[wgidy*dst_cols+wgidx]  = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        dst_m11[wgidy*dst_cols+wgidx]  = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        dst_m02[wgidy*dst_cols+wgidx]  = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        dst_m30[wgidy*dst_cols+wgidx]  = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        dst_m12[wgidy*dst_cols+wgidx]  = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        dst_m03[wgidy*dst_cols+wgidx]  = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
+//#endif
+//#if defined (DOUBLE_SUPPORT)
+__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+                           __global double* dst_m00,
+                           __global double* dst_m10,
+                           __global double* dst_m01,
+                           __global double* dst_m20,
+                           __global double* dst_m11,
+                           __global double* dst_m02,
+                           __global double* dst_m30,
+                           __global double* dst_m21,
+                           __global double* dst_m12,
+                           __global double* dst_m03,
+                           int dst_cols, int dst_step,
+                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+{
+    short tmp_coi[8]; // get the coi data
+    short8 tmp[32];
+    int VLEN_S =8; // vector length of short
+    int gidy = get_global_id(0);
+    int gidx = get_global_id(1);
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // real Y index of pixel
+    int x = wgidx*TILE_SIZE;  // real X index of pixel
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step/2, TILE_SIZE);
+    tileSize_height = min(TILE_SIZE, src_rows - y);
+    tileSize_width = min(TILE_SIZE, src_cols -x);
+    if(tileSize_width < TILE_SIZE)
+        for(int i = tileSize_width; i < rstep; i++ )
+            *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
+    if( coi > 0 )
+        for(int i=0; i < tileSize_width; i+=VLEN_S)
+        {
+            for(int j=0; j<VLEN_S; j++)
+                tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
+            tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
+        }
+    else
+        for(int i=0; i < tileSize_width; i+=VLEN_S)
+            tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
+    short8 zero = (short8)(0);
+    short8 full = (short8)(255);
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=(VLEN_S))
+            tmp[i/VLEN_S] = (tmp[i/VLEN_S]!=zero)?full:zero;
+
+    double mom[10];
+    __local long m[10][128];
+    if(lidy == 0)
+        for(int i=0; i<10; i++)
+            for(int j=0; j<128; j++)
+                m[i][j]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    long lm[10] = {0};
+    int8 x0 = (int8)(0);
+    int8 x1 = (int8)(0);
+    int8 x2 = (int8)(0);
+    long8 x3 = (long8)(0);
+    for( int xt = 0 ; xt < tileSize_width; xt+= (VLEN_S))
+    {
+        int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
+        int8 p = convert_int8(tmp[xt/VLEN_S]);
+        int8 xp = v_xt * p, xxp = xp * v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += convert_long8(xxp) * convert_long8(v_xt);
+    }
+    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
+
+    int py = lidy * x0.s0, sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((long)py) * sy;  // m03
+        m[8][lidy-bheight] = ((long)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((long)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+    else if(lidy < bheight)
+    {
+        lm[9] = ((long)py) * sy;  // m03
+        lm[8] = ((long)(x1.s0)) * sy;  // m12
+        lm[7] = ((long)(x2.s0)) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for( int j = TILE_SIZE/2; j >=1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lidy ==0 &&lidx ==0)
+    {
+        for(int mt = 0; mt < 10; mt++ )
+            mom[mt] = (double)lm[mt];
+
+        if(binary)
+        {
+            double s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+
+        double xm = x * mom[0], ym = y*mom[0];
+
+        // accumulate moments computed in each tile
+
+        // + m00 ( = m00' )
+        dst_m00[wgidy*dst_cols+wgidx]  = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        dst_m10[wgidy*dst_cols+wgidx]  = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        dst_m01[wgidy*dst_cols+wgidx]  = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        dst_m20[wgidy*dst_cols+wgidx]  = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        dst_m11[wgidy*dst_cols+wgidx]   = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        dst_m02[wgidy*dst_cols+wgidx]   = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        dst_m30[wgidy*dst_cols+wgidx]  = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        dst_m21[wgidy*dst_cols+wgidx]  = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        dst_m12[wgidy*dst_cols+wgidx]  = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        dst_m03[wgidy*dst_cols+wgidx]  = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
+//#endif
+//#if defined (DOUBLE_SUPPORT)
+__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+                            __global double* dst_m00,
+                            __global double* dst_m10,
+                            __global double* dst_m01,
+                            __global double* dst_m20,
+                            __global double* dst_m11,
+                            __global double* dst_m02,
+                            __global double* dst_m30,
+                            __global double* dst_m21,
+                            __global double* dst_m12,
+                            __global double* dst_m03,
+                            int dst_cols, int dst_step,
+                            int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+{
+    float tmp_coi[4]; // get the coi data
+    float4 tmp[64] ;
+    int VLEN_F = 4; // vector length of float
+    int gidy = get_global_id(0);
+    int gidx = get_global_id(1);
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // real Y index of pixel
+    int x = wgidx*TILE_SIZE;  // real X index of pixel
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step/4, TILE_SIZE);
+    tileSize_height = min(TILE_SIZE, src_rows - y);
+    tileSize_width = min(TILE_SIZE, src_cols -x);
+    if(tileSize_width < TILE_SIZE)
+        for(int i = tileSize_width; i < rstep; i++ )
+            *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
+    if( coi > 0 )
+        for(int i=0; i < tileSize_width; i+=VLEN_F)
+        {
+            for(int j=0; j<4; j++)
+                tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
+            tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
+        }
+    else
+        for(int i=0; i < tileSize_width; i+=VLEN_F)
+            tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
+    float4 zero = (float4)(0);
+    float4 full = (float4)(255);
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=4)
+            tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
+    double mom[10];
+    __local double m[10][128];
+    if(lidy == 0)
+        for(int i = 0; i < 10; i ++)
+            for(int j = 0; j < 128; j ++)
+                m[i][j] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    double lm[10] = {0};
+    double4 x0 = (double4)(0);
+    double4 x1 = (double4)(0);
+    double4 x2 = (double4)(0);
+    double4 x3 = (double4)(0);
+    for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F )
+    {
+        double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3);
+        double4 p = convert_double4(tmp[xt/VLEN_F]);
+        double4 xp = v_xt * p, xxp = xp * v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += xxp * v_xt;
+    }
+    x0.s0 += x0.s1 + x0.s2 + x0.s3;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3;
+/*
+    double py = lidy * x0.s0, sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((double)py) * sy;  // m03
+        m[8][lidy-bheight] = ((double)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((double)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+    else if(lidy < bheight)
+    {
+        lm[9] = ((double)py) * sy;  // m03
+        lm[8] = ((double)x1.s0) * sy;  // m12
+        lm[7] = ((double)x2.s0) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lidy == 0&&lidx == 0)
+    {
+        for(int mt = 0; mt < 10; mt++ )
+            mom[mt] = (double)lm[mt];
+
+        if(binary)
+        {
+            double s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+
+        double xm = x * mom[0], ym = y * mom[0];
+
+        // accumulate moments computed in each tile
+
+        // + m00 ( = m00' )
+        dst_m00[wgidy*dst_cols+wgidx]= mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        dst_m02[wgidy*dst_cols+wgidx]= mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        dst_m30[wgidy*dst_cols+wgidx]= mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        dst_m03[wgidy*dst_cols+wgidx]= mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }*/
+}
+//#endif
+//#if defined (DOUBLE_SUPPORT)
+__kernel void CvMoments_D6(__global double* src_data,  int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height,
+                           __global double* dst_m00,
+                           __global double* dst_m10,
+                           __global double* dst_m01,
+                           __global double* dst_m20,
+                           __global double* dst_m11,
+                           __global double* dst_m02,
+                           __global double* dst_m30,
+                           __global double* dst_m21,
+                           __global double* dst_m12,
+                           __global double* dst_m03,
+                           int dst_cols, int dst_step,
+                           int type, int depth, int cn, int coi, int binary, const int TILE_SIZE)
+{
+    double tmp_coi[4]; // get the coi data
+    double4 tmp[64];
+    int VLEN_D = 4; // length of vetor
+    int gidy = get_global_id(0);
+    int gidx = get_global_id(1);
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // real Y index of pixel
+    int x = wgidx*TILE_SIZE;  // real X index of pixel
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step/8, TILE_SIZE);
+    tileSize_height = min(TILE_SIZE,  src_rows - y);
+    tileSize_width = min(TILE_SIZE, src_cols - x);
+
+    if(tileSize_width < TILE_SIZE)
+        for(int i = tileSize_width; i < rstep; i++ )
+            *((__global double*)src_data+(y+lidy)*src_step/8+x+i) = 0;
+    if( coi > 0 )
+        for(int i=0; i < tileSize_width; i+=VLEN_D)
+        {
+            for(int j=0; j<4; j++)
+                tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
+            tmp[i/VLEN_D] = (double4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
+        }
+    else
+        for(int i=0; i < tileSize_width; i+=VLEN_D)
+            tmp[i/VLEN_D] = (double4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
+    double4 zero = (double4)(0);
+    double4 full = (double4)(255);
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=VLEN_D)
+            tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
+    double mom[10];
+    __local double m[10][128];
+    if(lidy == 0)
+        for(int i=0; i<10; i++)
+            for(int j=0; j<128; j++)
+                m[i][j]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    double lm[10] = {0};
+    double4 x0 = (double4)(0);
+    double4 x1 = (double4)(0);
+    double4 x2 = (double4)(0);
+    double4 x3 = (double4)(0);
+    for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
+    {
+        double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3);
+        double4 p = tmp[xt/VLEN_D];
+        double4 xp = v_xt * p, xxp = xp * v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += xxp *v_xt;
+    }
+    x0.s0 += x0.s1 + x0.s2 + x0.s3;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3;
+
+    double py = lidy * x0.s0, sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((double)py) * sy;  // m03
+        m[8][lidy-bheight] = ((double)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((double)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+
+    else if(lidy < bheight)
+    {
+        lm[9] = ((double)py) * sy;  // m03
+        lm[8] = ((double)x1.s0) * sy;  // m12
+        lm[7] = ((double)x2.s0) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lidy == 0&&lidx == 0)
+    {
+        for( int mt = 0; mt < 10; mt++ )
+            mom[mt] = (double)lm[mt];
+        if(binary)
+        {
+            double s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+
+        double xm = x * mom[0], ym = y * mom[0];
+
+        // accumulate moments computed in each tile
+
+        // + m00 ( = m00' )
+        dst_m00[wgidy*dst_cols+wgidx] = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        dst_m20[wgidy*dst_cols+wgidx]  = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        dst_m11[wgidy*dst_cols+wgidx]  = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        dst_m02[wgidy*dst_cols+wgidx]  = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        dst_m30[wgidy*dst_cols+wgidx]  = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        dst_m21[wgidy*dst_cols+wgidx]  = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        dst_m12[wgidy*dst_cols+wgidx]  = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        dst_m03[wgidy*dst_cols+wgidx]  = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
+//#endif
--- a/modules/ocl/src/kernels/nonfree_surf.cl
+++ b/modules/ocl/src/kernels/nonfree_surf.cl
@@ -43,10 +43,39 @@
 //
 //M*/

-#pragma OPENCL EXTENSION cl_amd_printf : enable
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable

+// specialized for non-image2d_t supported platform, intel HD4000, for example
+#ifdef DISABLE_IMAGE2D
+#define IMAGE_INT32 __global uint  *
+#define IMAGE_INT8  __global uchar *
+#else
+#define IMAGE_INT32 image2d_t
+#define IMAGE_INT8  image2d_t
+#endif
+
+uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, int elemPerRow)
+{
+#ifdef DISABLE_IMAGE2D
+    int x = clamp(coord.x, 0, cols);
+    int y = clamp(coord.y, 0, rows);
+    return img[elemPerRow * y + x];
+#else
+    return read_imageui(img, sam, coord).x;
+#endif
+}
+uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow)
+{
+#ifdef DISABLE_IMAGE2D
+    int x = clamp(convert_int_rte(coord.x), 0, cols - 1);
+    int y = clamp(convert_int_rte(coord.y), 0, rows - 1);
+    return img[elemPerRow * y + x];
+#else
+    return (uchar)read_imageui(img, sam, coord).x;
+#endif
+}
+
 // dynamically change the precision used for floating type

 #if defined (__ATI__) || defined (__NVIDIA__)
@@ -58,14 +87,24 @@
 // Image read mode
 __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

+#ifndef FLT_EPSILON
 #define FLT_EPSILON (1e-15)
-#define CV_PI_F 3.14159265f
+#endif

+#ifndef CV_PI_F
+#define CV_PI_F 3.14159265f
+#endif

 // Use integral image to calculate haar wavelets.
 // N = 2
 // for simple haar paatern
-float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int oldSize, int newSize, int y, int x)
+float icvCalcHaarPatternSum_2(
+    IMAGE_INT32 sumTex,
+    __constant float src[2][5],
+    int oldSize,
+    int newSize,
+    int y, int x,
+    int rows, int cols, int elemPerRow)
 {

    float ratio = (float)newSize / oldSize;
@@ -81,11 +120,10 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int
        int dy2 = convert_int_rte(ratio * src[k][3]);

        F t = 0;
-        t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
-        t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
-        t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
-        t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
-
+        t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
+        t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
+        t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
+        t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
        d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
    }

@@ -93,7 +131,13 @@ float icvCalcHaarPatternSum_2(image2d_t sumTex, __constant float src[2][5], int
 }

 // N = 3
-float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int oldSize, int newSize, int y, int x)
+float icvCalcHaarPatternSum_3(
+    IMAGE_INT32 sumTex,
+    __constant float src[2][5],
+    int oldSize,
+    int newSize,
+    int y, int x,
+    int rows, int cols, int elemPerRow)
 {

    float ratio = (float)newSize / oldSize;
@@ -109,11 +153,10 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int
        int dy2 = convert_int_rte(ratio * src[k][3]);

        F t = 0;
-        t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
-        t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
-        t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
-        t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
-
+        t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
+        t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
+        t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
+        t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
        d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
    }

@@ -121,7 +164,13 @@ float icvCalcHaarPatternSum_3(image2d_t sumTex, __constant float src[3][5], int
 }

 // N = 4
-float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int oldSize, int newSize, int y, int x)
+float icvCalcHaarPatternSum_4(
+    IMAGE_INT32 sumTex,
+    __constant float src[2][5],
+    int oldSize,
+    int newSize,
+    int y, int x,
+    int rows, int cols, int elemPerRow)
 {

    float ratio = (float)newSize / oldSize;
@@ -137,11 +186,10 @@ float icvCalcHaarPatternSum_4(image2d_t sumTex, __constant float src[4][5], int
        int dy2 = convert_int_rte(ratio * src[k][3]);

        F t = 0;
-        t += read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy1)).x;
-        t -= read_imageui(sumTex, sampler, (int2)(x + dx1, y + dy2)).x;
-        t -= read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy1)).x;
-        t += read_imageui(sumTex, sampler, (int2)(x + dx2, y + dy2)).x;
-
+        t += read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy1), rows, cols, elemPerRow );
+        t -= read_sumTex( sumTex, sampler, (int2)(x + dx1, y + dy2), rows, cols, elemPerRow );
+        t -= read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy1), rows, cols, elemPerRow );
+        t += read_sumTex( sumTex, sampler, (int2)(x + dx2, y + dy2), rows, cols, elemPerRow );
        d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
    }

@@ -172,7 +220,7 @@ __inline int calcSize(int octave, int layer)

 //calculate targeted layer per-pixel determinant and trace with an integral image
 __kernel void icvCalcLayerDetAndTrace(
-    image2d_t sumTex, // input integral image
+    IMAGE_INT32 sumTex, // input integral image
    __global float * det,      // output Determinant
    __global float * trace,    // output trace
    int det_step,     // the step of det in bytes
@@ -181,11 +229,13 @@ __kernel void icvCalcLayerDetAndTrace(
    int c_img_cols,
    int c_nOctaveLayers,
    int c_octave,
-    int c_layer_rows
+    int c_layer_rows,
+    int sumTex_step
    )
 {
    det_step   /= sizeof(*det);
    trace_step /= sizeof(*trace);
+    sumTex_step/= sizeof(uint);
    // Determine the indices
    const int gridDim_y  = get_num_groups(1) / (c_nOctaveLayers + 2);
    const int blockIdx_y = get_group_id(1) % gridDim_y;
@@ -205,9 +255,9 @@ __kernel void icvCalcLayerDetAndTrace(

    if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
    {
-        const float dx  = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave);
-        const float dy  = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave);
-        const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave);
+        const float dx  = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
+        const float dy  = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
+        const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);

        det  [j + margin + det_step   * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
        trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
@@ -220,7 +270,7 @@ __kernel void icvCalcLayerDetAndTrace(

 __constant float c_DM[5] = {0, 0, 9, 9, 1};

-bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)
+bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int rows, int cols, int step)
 {
    float ratio = (float)size / 9.0f;

@@ -233,10 +283,10 @@ bool within_check(image2d_t maskSumTex, int sum_i, int sum_j, int size)

    float t = 0;

-    t += read_imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1)).x;
-    t -= read_imageui(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2)).x;
-    t -= read_imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1)).x;
-    t += read_imageui(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2)).x;
+    t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1), rows, cols, step);
+    t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2), rows, cols, step);
+    t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1), rows, cols, step);
+    t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2), rows, cols, step);

    d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));

@@ -261,7 +311,8 @@ __kernel
    int c_layer_cols,
    int c_max_candidates,
    float c_hessianThreshold,
-    image2d_t maskSumTex
+    IMAGE_INT32 maskSumTex,
+    int mask_step
    )
 {
    volatile __local  float N9[768]; // threads.x * threads.y * 3
@@ -269,6 +320,7 @@ __kernel
    det_step   /= sizeof(*det);
    trace_step /= sizeof(*trace);
    maxCounter += counter_offset;
+    mask_step  /= sizeof(uint);

    // Determine the indices
    const int gridDim_y  = get_num_groups(1) / c_nOctaveLayers;
@@ -321,7 +373,7 @@ __kernel
            const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
            const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;

-            if (within_check(maskSumTex, sum_i, sum_j, size))
+            if (within_check(maskSumTex, sum_i, sum_j, size, c_img_rows, c_img_cols, mask_step))
            {
                // Check to see if we have a max (in its 26 neighbours)
                const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
@@ -704,14 +756,16 @@ void reduce_32_sum(volatile __local  float * data, float partial_reduction, int

 __kernel
    void icvCalcOrientation(
-    image2d_t sumTex,
+    IMAGE_INT32 sumTex,
    __global float * keypoints,
    int keypoints_step,
    int c_img_rows,
-    int c_img_cols
+    int c_img_cols,
+    int sum_step
    )
 {
    keypoints_step /= sizeof(*keypoints);
+    sum_step       /= sizeof(uint);
    __global float* featureX    = keypoints + X_ROW * keypoints_step;
    __global float* featureY    = keypoints + Y_ROW * keypoints_step;
    __global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
@@ -754,8 +808,8 @@ __kernel
        if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
            x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
        {
-            X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x);
-            Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x);
+            X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
+            Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);

            angle = atan2(Y, X);

@@ -881,20 +935,20 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =

 // utility for linear filter
 inline uchar readerGet(
-    image2d_t src,
+    IMAGE_INT8 src,
    const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
-    int i, int j
+    int i, int j, int rows, int cols, int elemPerRow
    )
 {
    float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
    float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
-    return (uchar)read_imageui(src, sampler, (float2)(pixel_x, pixel_y)).x;
+    return read_imgTex(src, sampler, (float2)(pixel_x, pixel_y), rows, cols, elemPerRow);
 }

 inline float linearFilter(
-    image2d_t src,
+    IMAGE_INT8 src,
    const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
-    float y, float x
+    float y, float x, int rows, int cols, int elemPerRow
    )
 {
    x -= 0.5f;
@@ -907,30 +961,33 @@ inline float linearFilter(
    const int x2 = x1 + 1;
    const int y2 = y1 + 1;

-    uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1);
+    uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1, rows, cols, elemPerRow);
    out = out + src_reg * ((x2 - x) * (y2 - y));

-    src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2);
+    src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2, rows, cols, elemPerRow);
    out = out + src_reg * ((x - x1) * (y2 - y));

-    src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1);
+    src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1, rows, cols, elemPerRow);
    out = out + src_reg * ((x2 - x) * (y - y1));

-    src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2);
+    src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2, rows, cols, elemPerRow);
    out = out + src_reg * ((x - x1) * (y - y1));

    return out;
 }

 void calc_dx_dy(
-    image2d_t imgTex,
+    IMAGE_INT8 imgTex,
    volatile __local  float s_dx_bin[25],
    volatile __local  float s_dy_bin[25],
    volatile __local  float s_PATCH[6][6],
    __global const float* featureX,
    __global const float* featureY,
    __global const float* featureSize,
-    __global const float* featureDir
+    __global const float* featureDir,
+    int rows,
+    int cols,
+    int elemPerRow
    )
 {
    const float centerX = featureX[get_group_id(0)];
@@ -965,7 +1022,7 @@ void calc_dx_dy(
    const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
    const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;

-    s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo);
+    s_PATCH[get_local_id(1)][get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow);

    barrier(CLK_LOCAL_MEM_FENCE);

@@ -1035,16 +1092,18 @@ void reduce_sum25(

 __kernel
    void compute_descriptors64(
-    image2d_t imgTex,
+    IMAGE_INT8 imgTex,
    volatile __global float * descriptors,
    __global const float * keypoints,
    int descriptors_step,
-    int keypoints_step
+    int keypoints_step,
+    int rows,
+    int cols,
+    int img_step
    )
 {
    descriptors_step /= sizeof(float);
    keypoints_step   /= sizeof(float);
-
    __global const float * featureX    = keypoints + X_ROW * keypoints_step;
    __global const float * featureY    = keypoints + Y_ROW * keypoints_step;
    __global const float * featureSize = keypoints + SIZE_ROW * keypoints_step;
@@ -1057,7 +1116,7 @@ __kernel
    volatile __local  float sdyabs[25];
    volatile __local  float s_PATCH[6][6];

-    calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
+    calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
    barrier(CLK_LOCAL_MEM_FENCE);

    const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@@ -1066,10 +1125,10 @@ __kernel
    {
        sdxabs[tid] = fabs(sdx[tid]); // |dx| array
        sdyabs[tid] = fabs(sdy[tid]); // |dy| array
-        barrier(CLK_LOCAL_MEM_FENCE);
+        //barrier(CLK_LOCAL_MEM_FENCE);

        reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        //barrier(CLK_LOCAL_MEM_FENCE);

        volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);

@@ -1085,11 +1144,14 @@ __kernel
 }
 __kernel
    void compute_descriptors128(
-    image2d_t imgTex,
+    IMAGE_INT8 imgTex,
    __global volatile float * descriptors,
    __global float * keypoints,
    int descriptors_step,
-    int keypoints_step
+    int keypoints_step,
+    int rows,
+    int cols,
+    int img_step
    )
 {
    descriptors_step /= sizeof(*descriptors);
@@ -1111,7 +1173,7 @@ __kernel
    volatile __local  float sdabs2[25];
    volatile __local  float s_PATCH[6][6];

-    calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
+    calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
    barrier(CLK_LOCAL_MEM_FENCE);

    const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@@ -1132,10 +1194,10 @@ __kernel
            sd2[tid] = sdx[tid];
            sdabs2[tid] = fabs(sdx[tid]);
        }
-        barrier(CLK_LOCAL_MEM_FENCE);
+        //barrier(CLK_LOCAL_MEM_FENCE);

        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        //barrier(CLK_LOCAL_MEM_FENCE);

        volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);

@@ -1162,10 +1224,10 @@ __kernel
            sd2[tid] = sdy[tid];
            sdabs2[tid] = fabs(sdy[tid]);
        }
-        barrier(CLK_LOCAL_MEM_FENCE);
+        //barrier(CLK_LOCAL_MEM_FENCE);

        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        //barrier(CLK_LOCAL_MEM_FENCE);

        // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
        if (tid == 0)
--- a/modules/ocl/src/matrix_operations.cpp
+++ b/modules/ocl/src/matrix_operations.cpp
@@ -68,6 +68,8 @@ namespace cv
        extern const char *operator_setTo;
        extern const char *operator_setToM;
        extern const char *convertC3C4;
+        extern DevMemType gDeviceMemType;
+        extern DevMemRW gDeviceMemRW;
    }
 }

@@ -911,7 +913,17 @@ oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const

 }

+void cv::ocl::oclMat::createEx(Size size, int type, DevMemRW rw_type, DevMemType mem_type)
+{
+    createEx(size.height, size.width, type, rw_type, mem_type);
+}
+
 void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
+{
+    createEx(_rows, _cols, _type, gDeviceMemRW, gDeviceMemType);
+}
+
+void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type, DevMemRW rw_type, DevMemType mem_type)
 {
    clCxt = Context::getContext();
    /* core logic */
@@ -936,7 +948,7 @@ void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
        size_t esz = elemSize();

        void *dev_ptr;
-        openCLMallocPitch(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), rows);
+        openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), rows, rw_type, mem_type);
        //openCLMallocPitch(clCxt,&dev_ptr, &step, esz * cols, rows);

        if (esz * cols == step)
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@@ -217,6 +217,36 @@ namespace cv
        {
            openCLFree(texture);
        }
+
+        bool support_image2d(Context *clCxt)
+        {
+            static const char * _kernel_string = "__kernel void test_func(image2d_t img) {}";
+            static bool _isTested = false;
+            static bool _support = false;
+            if(_isTested)
+            {
+                return _support;
+            }
+            try
+            {
+                cv::ocl::openCLGetKernelFromSource(clCxt, &_kernel_string, "test_func");
+                _support = true;
+            }
+            catch (const cv::Exception& e)
+            {
+                if(e.code == -217)
+                {
+                    _support = false;
+                }
+                else
+                {
+                    // throw e once again
+                    throw e;
+                }
+            }
+            _isTested = true;
+            return _support;
+        }
    }//namespace ocl

 }//namespace cv
--- a/modules/ocl/src/mcwutil.hpp
+++ b/modules/ocl/src/mcwutil.hpp
@@ -69,6 +69,10 @@ namespace cv
        //   2. for faster clamping, there is no buffer padding for the constructed texture
        cl_mem bindTexture(const oclMat &mat);
        void releaseTexture(cl_mem& texture);
+
+        // returns whether the current context supports image2d_t format or not
+        bool support_image2d(Context *clCxt = Context::getContext());
+
    }//namespace ocl

 }//namespace cv
--- a/modules/ocl/src/moments.cpp
+++ b/modules/ocl/src/moments.cpp
@@ -0,0 +1,379 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Sen Liu, sen@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+#include <iostream>
+namespace cv
+{
+namespace ocl
+{
+extern const char *moments;
+
+// The function calculates center of gravity and the central second order moments
+static void icvCompleteMomentState( CvMoments* moments )
+{
+    double cx = 0, cy = 0;
+    double mu20, mu11, mu02;
+
+    assert( moments != 0 );
+    moments->inv_sqrt_m00 = 0;
+
+    if( fabs(moments->m00) > DBL_EPSILON )
+    {
+        double inv_m00 = 1. / moments->m00;
+        cx = moments->m10 * inv_m00;
+        cy = moments->m01 * inv_m00;
+        moments->inv_sqrt_m00 = std::sqrt( fabs(inv_m00) );
+    }
+
+    // mu20 = m20 - m10*cx
+    mu20 = moments->m20 - moments->m10 * cx;
+    // mu11 = m11 - m10*cy
+    mu11 = moments->m11 - moments->m10 * cy;
+    // mu02 = m02 - m01*cy
+    mu02 = moments->m02 - moments->m01 * cy;
+
+    moments->mu20 = mu20;
+    moments->mu11 = mu11;
+    moments->mu02 = mu02;
+
+    // mu30 = m30 - cx*(3*mu20 + cx*m10)
+    moments->mu30 = moments->m30 - cx * (3 * mu20 + cx * moments->m10);
+    mu11 += mu11;
+    // mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20
+    moments->mu21 = moments->m21 - cx * (mu11 + cx * moments->m01) - cy * mu20;
+    // mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02
+    moments->mu12 = moments->m12 - cy * (mu11 + cy * moments->m10) - cx * mu02;
+    // mu03 = m03 - cy*(3*mu02 + cy*m01)
+    moments->mu03 = moments->m03 - cy * (3 * mu02 + cy * moments->m01);
+}
+
+
+static void icvContourMoments( CvSeq* contour, CvMoments* mom )
+{
+    if( contour->total )
+    {
+        CvSeqReader reader;
+        int lpt = contour->total;
+        double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03;
+        int dst_type = cv::ocl::Context::getContext()->impl->double_support ? CV_64FC1 : CV_32FC1;
+
+        cvStartReadSeq( contour, &reader, 0 );
+
+        cv::ocl::oclMat dst_a00(1,lpt,dst_type);
+        cv::ocl::oclMat dst_a10(1,lpt,dst_type);
+        cv::ocl::oclMat dst_a01(1,lpt,dst_type);
+        cv::ocl::oclMat dst_a20(1,lpt,dst_type);
+        cv::ocl::oclMat dst_a11(1,lpt,dst_type);
+        cv::ocl::oclMat dst_a02(1,lpt,dst_type);
+        cv::ocl::oclMat dst_a30(1,lpt,dst_type);
+        cv::ocl::oclMat dst_a21(1,lpt,dst_type);
+        cv::ocl::oclMat dst_a12(1,lpt,dst_type);
+        cv::ocl::oclMat dst_a03(1,lpt,dst_type);
+        size_t reader_size = lpt << 1;
+        cv::Mat reader_mat(1,reader_size,CV_32FC1);
+
+        bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2;
+
+        if( is_float )
+        {
+            for(size_t i = 0; i < reader_size; ++i)
+            {
+                reader_mat.at<float>(0, i++) = ((CvPoint2D32f*)(reader.ptr))->x;
+                reader_mat.at<float>(0, i) = ((CvPoint2D32f*)(reader.ptr))->y;
+                CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
+            }
+        }
+        else
+        {
+            for(size_t i = 0; i < reader_size; ++i)
+            {
+                reader_mat.at<float>(0, i++) = ((CvPoint*)(reader.ptr))->x;
+                reader_mat.at<float>(0, i) = ((CvPoint*)(reader.ptr))->y;
+                CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
+            }
+        }
+
+        cv::ocl::oclMat reader_oclmat(reader_mat);
+        int llength = std::min(lpt,128);
+        size_t localThreads[3]  = { llength, 1, 1};
+        size_t globalThreads[3] = { lpt, 1, 1};
+        std::vector<std::pair<size_t , const void *> > args;
+        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&contour->total ));
+        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data ));
+        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a00.data ));
+        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a10.data ));
+        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a01.data ));
+        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a20.data ));
+        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a11.data ));
+        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a02.data ));
+        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a30.data ));
+        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a21.data ));
+        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a12.data ));
+        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a03.data ));
+        openCLExecuteKernel(dst_a00.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1);
+
+        cv::Mat dst(dst_a00);
+        cv::Scalar s = cv::sum(dst);
+        a00 = s[0];
+        dst = dst_a10;
+        s = cv::sum(dst);
+        a10 = s[0];//dstsum[1];
+        dst = dst_a01;
+        s = cv::sum(dst);
+        a01 = s[0];//dstsum[2];
+        dst = dst_a20;
+        s = cv::sum(dst);
+        a20 = s[0];//dstsum[3];
+        dst = dst_a11;
+        s = cv::sum(dst);
+        a11 = s[0];//dstsum[4];
+        dst = dst_a02;
+        s = cv::sum(dst);
+        a02 = s[0];//dstsum[5];
+        dst = dst_a30;
+        s = cv::sum(dst);
+        a30 = s[0];//dstsum[6];
+        dst = dst_a21;
+        s = cv::sum(dst);
+        a21 = s[0];//dstsum[7];
+        dst = dst_a12;
+        s = cv::sum(dst);
+        a12 = s[0];//dstsum[8];
+        dst = dst_a03;
+        s = cv::sum(dst);
+        a03 = s[0];//dstsum[9];
+
+        double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60;
+        if( fabs(a00) > FLT_EPSILON )
+        {
+            if( a00 > 0 )
+            {
+                db1_2 = 0.5;
+                db1_6 = 0.16666666666666666666666666666667;
+                db1_12 = 0.083333333333333333333333333333333;
+                db1_24 = 0.041666666666666666666666666666667;
+                db1_20 = 0.05;
+                db1_60 = 0.016666666666666666666666666666667;
+            }
+            else
+            {
+                db1_2 = -0.5;
+                db1_6 = -0.16666666666666666666666666666667;
+                db1_12 = -0.083333333333333333333333333333333;
+                db1_24 = -0.041666666666666666666666666666667;
+                db1_20 = -0.05;
+                db1_60 = -0.016666666666666666666666666666667;
+            }
+
+            // spatial moments
+            mom->m00 = a00 * db1_2;
+            mom->m10 = a10 * db1_6;
+            mom->m01 = a01 * db1_6;
+            mom->m20 = a20 * db1_12;
+            mom->m11 = a11 * db1_24;
+            mom->m02 = a02 * db1_12;
+            mom->m30 = a30 * db1_20;
+            mom->m21 = a21 * db1_60;
+            mom->m12 = a12 * db1_60;
+            mom->m03 = a03 * db1_20;
+
+            icvCompleteMomentState( mom );
+        }
+    }
+}
+
+static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
+{
+    const int TILE_SIZE = 256;
+    int type, depth, cn, coi = 0;
+    CvMat stub, *mat = (CvMat*)array;
+    CvContour contourHeader;
+    CvSeq* contour = 0;
+    CvSeqBlock block;
+    if( CV_IS_SEQ( array ))
+    {
+        contour = (CvSeq*)array;
+        if( !CV_IS_SEQ_POINT_SET( contour ))
+            CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" );
+    }
+
+    if( !moments )
+        CV_Error( CV_StsNullPtr, "" );
+
+    memset( mom, 0, sizeof(*mom));
+
+    if( !contour )
+    {
+
+        mat = cvGetMat( mat, &stub, &coi );
+        type = CV_MAT_TYPE( mat->type );
+
+        if( type == CV_32SC2 || type == CV_32FC2 )
+        {
+            contour = cvPointSeqFromMat(
+                          CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED,
+                          mat, &contourHeader, &block );
+        }
+    }
+    if( contour )
+    {
+        icvContourMoments( contour, mom );
+        return;
+    }
+
+    type = CV_MAT_TYPE( mat->type );
+    depth = CV_MAT_DEPTH( type );
+    cn = CV_MAT_CN( type );
+
+    cv::Size size = cvGetMatSize( mat );
+    if( cn > 1 && coi == 0 )
+        CV_Error( CV_StsBadArg, "Invalid image type" );
+
+    if( size.width <= 0 || size.height <= 0 )
+        return;
+
+    cv::Mat src0(mat);
+    cv::ocl::oclMat src(src0);
+    cv::Size tileSize;
+    int blockx,blocky;
+    if(size.width%TILE_SIZE == 0)
+        blockx = size.width/TILE_SIZE;
+    else
+        blockx = size.width/TILE_SIZE + 1;
+    if(size.height%TILE_SIZE == 0)
+        blocky = size.height/TILE_SIZE;
+    else
+        blocky = size.height/TILE_SIZE + 1;
+    cv::ocl::oclMat dst_m00(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m10(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m01(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m20(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m11(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m02(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m30(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m21(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m12(blocky, blockx, CV_64FC1);
+    cv::ocl::oclMat dst_m03(blocky, blockx, CV_64FC1);
+    cl_mem sum = openCLCreateBuffer(src.clCxt,CL_MEM_READ_WRITE,10*sizeof(double));
+    int tile_width  = std::min(size.width,TILE_SIZE);
+    int tile_height = std::min(size.height,TILE_SIZE);
+    size_t localThreads[3]  = { tile_height, 1, 1};
+    size_t globalThreads[3] = { size.height, blockx, 1};
+    std::vector<std::pair<size_t , const void *> > args,args_sum;
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&tileSize.width ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&tileSize.height ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m00.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m10.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m01.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m20.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m11.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m02.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m30.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m21.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m12.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m03.data ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m00.cols ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m00.step ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&type ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&depth ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cn ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&coi ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&binary ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
+    openCLExecuteKernel(dst_m00.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
+
+    size_t localThreadss[3]  = { 128, 1, 1};
+    size_t globalThreadss[3] = { 128, 1, 1};
+    args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
+    args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
+    args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&tile_height ));
+    args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&tile_width ));
+    args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sum ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m00.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m10.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m01.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m20.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m11.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m02.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m30.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m21.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m12.data ));
+    args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m03.data ));
+    openCLExecuteKernel(dst_m00.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
+    double* dstsum = new double[10];
+    memset(dstsum,0,10*sizeof(double));
+    openCLReadBuffer(dst_m00.clCxt,sum,(void *)dstsum,10*sizeof(double));
+    mom->m00 = dstsum[0];
+    mom->m10 = dstsum[1];
+    mom->m01 = dstsum[2];
+    mom->m20 = dstsum[3];
+    mom->m11 = dstsum[4];
+    mom->m02 = dstsum[5];
+    mom->m30 = dstsum[6];
+    mom->m21 = dstsum[7];
+    mom->m12 = dstsum[8];
+    mom->m03 = dstsum[9];
+
+    icvCompleteMomentState( mom );
+}
+
+Moments ocl_moments( InputArray _array, bool binaryImage )
+{
+    CvMoments om;
+    Mat arr = _array.getMat();
+    CvMat c_array = arr;
+    ocl_cvMoments(&c_array, &om, binaryImage);
+    return om;
+}
+
+}
+
+}
--- a/modules/ocl/src/precomp.hpp
+++ b/modules/ocl/src/precomp.hpp
@@ -93,6 +93,8 @@ namespace cv
        ///////////////////////////OpenCL call wrappers////////////////////////////
        void openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
                               size_t widthInBytes, size_t height);
+        void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
+                               size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type);
        void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
                            const void *src, size_t spitch,
                            size_t width, size_t height, enum openCLMemcpyKind kind, int channels = -1);
@@ -141,6 +143,7 @@ namespace cv
            //extra options to recognize vendor specific fp64 extensions
            char extra_options[512];
            std::string Binpath;
+            int unified_memory; //1 means integrated GPU, otherwise this value is 0
        };
    }
 }
--- a/modules/ocl/src/pyrlk.cpp
+++ b/modules/ocl/src/pyrlk.cpp
@@ -573,8 +573,9 @@ static void lkSparse_run(oclMat &I, oclMat &J,
    Context  *clCxt = I.clCxt;
    int elemCntPerRow = I.step / I.elemSize();
    std::string kernelName = "lkSparse";
-    size_t localThreads[3]  = { 8, 8, 1 };
-    size_t globalThreads[3] = { 8 * ptcount, 8, 1};
+    bool isImageSupported = support_image2d();
+    size_t localThreads[3]  = { 8, isImageSupported ? 8 : 32, 1 };
+    size_t globalThreads[3] = { 8 * ptcount, isImageSupported ? 8 : 32, 1};
    int cn = I.oclchannels();
    char calcErr;
    if (level == 0)
@@ -587,8 +588,9 @@ static void lkSparse_run(oclMat &I, oclMat &J,
    }

    std::vector<std::pair<size_t , const void *> > args;
-    cl_mem ITex = bindTexture(I);
-    cl_mem JTex = bindTexture(J);
+
+    cl_mem ITex = isImageSupported ? bindTexture(I) : (cl_mem)I.data;
+    cl_mem JTex = isImageSupported ? bindTexture(J) : (cl_mem)J.data;

    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ITex ));
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&JTex ));
@@ -601,6 +603,8 @@ static void lkSparse_run(oclMat &I, oclMat &J,
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&level ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.rows ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.cols ));
+    if (!isImageSupported)
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) );
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&patch.x ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&patch.y ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cn ));
@@ -609,19 +613,14 @@ static void lkSparse_run(oclMat &I, oclMat &J,
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&iters ));
    args.push_back( std::make_pair( sizeof(cl_char), (void *)&calcErr ));

-    try
+    if(isImageSupported)
    {
        openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
-    }
-    catch(Exception&)
-    {
-        printf("Warning: The image2d_t is not supported by the device. Using alternative method!\n");
        releaseTexture(ITex);
        releaseTexture(JTex);
-        ITex = (cl_mem)I.data;
-        JTex = (cl_mem)J.data;
-        localThreads[1] = globalThreads[1] = 32;
-        args.insert( args.begin()+11, std::make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) );
+    }
+    else
+    {
        openCLExecuteKernel2(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
    }
 }
@@ -723,7 +722,7 @@ static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v,
                 oclMat &prevU, oclMat &prevV, oclMat *err, Size winSize, int iters)
 {
    Context  *clCxt = I.clCxt;
-    bool isImageSupported = clCxt->impl->devName.find("Intel(R) HD Graphics") == std::string::npos;
+    bool isImageSupported = support_image2d();
    int elemCntPerRow = I.step / I.elemSize();

    std::string kernelName = "lkDense";
--- a/modules/ocl/src/surf.cpp
+++ b/modules/ocl/src/surf.cpp
@@ -1,4 +1,4 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
+/*M/////////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
@@ -44,6 +44,7 @@
 //M*/
 #include <iomanip>
 #include "precomp.hpp"
+#include "mcwutil.hpp"
 //#include "opencv2/highgui/highgui.hpp"

 using namespace cv;
@@ -70,7 +71,7 @@ static inline int calcSize(int octave, int layer)

    /* Wavelet size increment between layers. This should be an even number,
    such that the wavelet sizes in an octave are either all even or all odd.
-    This ensures that when looking for the neighbours of a sample, the layers
+    This ensures that when looking for the neighbors of a sample, the layers

    above and below are aligned correctly. */
    const int HAAR_SIZE_INC = 6;
@@ -78,6 +79,11 @@ static inline int calcSize(int octave, int layer)
    return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
 }

+namespace
+{
+    const char* noImage2dOption = "-D DISABLE_IMAGE2D";
+}
+
 class SURF_OCL_Invoker
 {
 public:
@@ -87,7 +93,7 @@ public:
    //void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
    //void loadOctaveConstants(int octave, int layer_rows, int layer_cols);

-    // kernel callers declearations
+    // kernel callers declarations
    void icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int layer_rows);

    void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
@@ -99,14 +105,14 @@ public:
    void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures);

    void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures);
-    // end of kernel callers declearations
+    // end of kernel callers declarations


    SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) :
        surf_(surf),
        img_cols(img.cols), img_rows(img.rows),
-        use_mask(!mask.empty()),
-        imgTex(NULL), sumTex(NULL), maskSumTex(NULL)
+        use_mask(!mask.empty()), counters(oclMat()),
+        imgTex(NULL), sumTex(NULL), maskSumTex(NULL), _img(img)
    {
        CV_Assert(!img.empty() && img.type() == CV_8UC1);
        CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
@@ -130,12 +136,13 @@ public:
        counters.create(1, surf_.nOctaves + 1, CV_32SC1);
        counters.setTo(Scalar::all(0));

-        //loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
-
+        integral(img, surf_.sum);
+        if(support_image2d())
+        {
        bindImgTex(img, imgTex);
-        integral(img, surf_.sum); // the two argumented integral version is incorrect
+            bindImgTex(surf_.sum, sumTex);
+        }

-        bindImgTex(surf_.sum, sumTex);
        maskSumTex = 0;

        if (use_mask)
@@ -154,7 +161,7 @@ public:
    void detectKeypoints(oclMat &keypoints)
    {
        // create image pyramid buffers
-        // different layers have same sized buffers, but they are sampled from gaussin kernel.
+        // different layers have same sized buffers, but they are sampled from Gaussian kernel.
        ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
        ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);

@@ -221,7 +228,6 @@ public:
            openCLFree(sumTex);
        if(maskSumTex)
            openCLFree(maskSumTex);
-        additioalParamBuffer.release();
    }

 private:
@@ -241,7 +247,7 @@ private:
    cl_mem sumTex;
    cl_mem maskSumTex;

-    oclMat additioalParamBuffer;
+    const oclMat _img; // make a copy for non-image2d_t supported platform

    SURF_OCL_Invoker &operator= (const SURF_OCL_Invoker &right)
    {
@@ -361,11 +367,6 @@ void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat
 {
    if (!img.empty())
    {
-        if (img.clCxt->impl->devName.find("Intel(R) HD Graphics") != std::string::npos)
-        {
-            std::cout << " Intel HD GPU device unsupported " << std::endl;
-            return;
-        }
        SURF_OCL_Invoker surf(*this, img, mask);

        surf.detectKeypoints(keypoints);
@@ -377,11 +378,6 @@ void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat
 {
    if (!img.empty())
    {
-        if (img.clCxt->impl->devName.find("Intel(R) HD Graphics") != std::string::npos)
-        {
-            std::cout << " Intel HD GPU device unsupported " << std::endl;
-            return;
-        }
        SURF_OCL_Invoker surf(*this, img, mask);

        if (!useProvidedKeypoints)
@@ -442,74 +438,11 @@ void cv::ocl::SURF_OCL::releaseMemory()
 // bind source buffer to image oject.
 void SURF_OCL_Invoker::bindImgTex(const oclMat &img, cl_mem &texture)
 {
-    cl_image_format format;
-    int err;
-    int depth    = img.depth();
-    int channels = img.channels();
-
-    switch(depth)
-    {
-    case CV_8U:
-        format.image_channel_data_type = CL_UNSIGNED_INT8;
-        break;
-    case CV_32S:
-        format.image_channel_data_type = CL_UNSIGNED_INT32;
-        break;
-    case CV_32F:
-        format.image_channel_data_type = CL_FLOAT;
-        break;
-    default:
-        throw std::exception();
-        break;
-    }
-    switch(channels)
-    {
-    case 1:
-        format.image_channel_order     = CL_R;
-        break;
-    case 3:
-        format.image_channel_order     = CL_RGB;
-        break;
-    case 4:
-        format.image_channel_order     = CL_RGBA;
-        break;
-    default:
-        throw std::exception();
-        break;
-    }
    if(texture)
    {
        openCLFree(texture);
    }
-
-#ifdef CL_VERSION_1_2
-    cl_image_desc desc;
-    desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
-    desc.image_width      = img.step / img.elemSize();
-    desc.image_height     = img.rows;
-    desc.image_depth      = 0;
-    desc.image_array_size = 1;
-    desc.image_row_pitch  = 0;
-    desc.image_slice_pitch = 0;
-    desc.buffer           = NULL;
-    desc.num_mip_levels   = 0;
-    desc.num_samples      = 0;
-    texture = clCreateImage(Context::getContext()->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
-#else
-    texture = clCreateImage2D(
-                  Context::getContext()->impl->clContext,
-                  CL_MEM_READ_WRITE,
-                  &format,
-                  img.step / img.elemSize(),
-                  img.rows,
-                  0,
-                  NULL,
-                  &err);
-#endif
-    size_t origin[] = { 0, 0, 0 };
-    size_t region[] = { img.step / img.elemSize(), img.rows, 1 };
-    clEnqueueCopyBufferToImage(img.clCxt->impl->clCmdQueue, (cl_mem)img.data, texture, 0, origin, region, 0, NULL, 0);
-    openCLSafeCall(err);
+    texture = bindTexture(img);
 }

 ////////////////////////////
@@ -524,7 +457,14 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
    std::string kernelName = "icvCalcLayerDetAndTrace";
    std::vector< std::pair<size_t, const void *> > args;

-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
+    if(sumTex)
+    {
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
+    }
+    else
+    {
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
+    }
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trace.data));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
@@ -534,6 +474,7 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nOctaveLayers));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&c_layer_rows));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));

    size_t localThreads[3]  = {16, 16, 1};
    size_t globalThreads[3] =
@@ -542,8 +483,15 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
        divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
        1
    };
+    if(support_image2d())
+    {
    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
 }
+    else
+    {
+        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+    }
+}

 void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
        int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols)
@@ -572,17 +520,31 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat

    if(use_mask)
    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maskSumTex));
+        if(maskSumTex)
+        {
+            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maskSumTex));
+        }
+        else
+        {
+            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.data));
+        }
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.step));
    }
-
    size_t localThreads[3]  = {16, 16, 1};
    size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) *localThreads[0],
                               divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nLayers *localThreads[1],
                               1
                              };

+    if(support_image2d())
+    {
    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
 }
+    else
+    {
+        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+    }
+}

 void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, unsigned int maxCounter,
        oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures)
@@ -606,8 +568,15 @@ void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMa
    size_t localThreads[3]  = {3, 3, 3};
    size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};

+    if(support_image2d())
+    {
    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
 }
+    else
+    {
+        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+    }
+}

 void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures)
 {
@@ -616,17 +585,32 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat

    std::vector< std::pair<size_t, const void *> > args;

+    if(sumTex)
+    {
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
+    }
+    else
+    {
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
+    }
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));

    size_t localThreads[3]  = {32, 4, 1};
    size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};

+    if(support_image2d())
+    {
    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
 }
+    else
+    {
+        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+    }
+}

 void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures)
 {
@@ -648,12 +632,29 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        globalThreads[1] = 16 * localThreads[1];

        args.clear();
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
+        if(imgTex)
+        {
+            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
+        }
+        else
+        {
+            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
+        }
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
-        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
+        if(support_image2d())
+        {
+            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+        }
+        else
+        {
+            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+        }

        kernelName = "normalize_descriptors64";

@@ -666,9 +667,16 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        args.clear();
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
+        if(support_image2d())
+        {
        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
    }
    else
+    {
+            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+        }
+    }
+    else
    {
        kernelName = "compute_descriptors128";

@@ -679,12 +687,29 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        globalThreads[1] = 16 * localThreads[1];

        args.clear();
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
+        if(imgTex)
+        {
+            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
+        }
+        else
+        {
+            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
+        }
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
-        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
+        if(support_image2d())
+        {
+            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+        }
+        else
+        {
+            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+        }

        kernelName = "normalize_descriptors128";

@@ -697,7 +722,14 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        args.clear();
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
-        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+        if(support_image2d())
+        {
+            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+        }
+        else
+        {
+            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
+        }
    }
 }