refactored and extended ocl::addWeighted

2013-09-24 13:51:37 +04:00
parent b20bd470fe
commit 161674bff2
2 changed files with 53 additions and 409 deletions
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -1795,64 +1795,66 @@ void cv::ocl::transpose(const oclMat &src, oclMat &dst)

 void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst)
 {
-    dst.create(src1.size(), src1.type());
-    CV_Assert(src1.cols ==  src2.cols && src2.cols == dst.cols &&
-              src1.rows ==  src2.rows && src2.rows == dst.rows);
-    CV_Assert(src1.type() == src2.type() && src1.type() == dst.type());
-
    Context *clCxt = src1.clCxt;
+    bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE);
+    if (!hasDouble && src1.depth() == CV_64F)
+    {
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
+        return;
+    }
+
+    CV_Assert(src1.size() ==  src2.size() && src1.type() == src2.type());
+    dst.create(src1.size(), src1.type());
+
    int channels = dst.oclchannels();
    int depth = dst.depth();

+    int cols1 = src1.cols * channels;
+    int src1step1 = src1.step1(), src1offset1 = src1.offset / src1.elemSize1();
+    int src2step1 = src2.step1(), src2offset1 = src2.offset / src1.elemSize1();
+    int dststep1 = dst.step1(), dstoffset1 = dst.offset / dst.elemSize1();

-    int vector_lengths[4][7] = {{4, 0, 4, 4, 4, 4, 4},
-        {4, 0, 4, 4, 4, 4, 4},
-        {4, 0, 4, 4, 4, 4, 4},
-        {4, 0, 4, 4, 4, 4, 4}
-    };
-
-
-    size_t vector_length = vector_lengths[channels - 1][depth];
-    int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
-    int cols = divUp(dst.cols * channels + offset_cols, vector_length);
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    std::string buildOptions = format("-D T=%s -D WT=%s -D convertToT=convert_%s%s",
+                                      typeMap[depth], hasDouble ? "double" : "float", typeMap[depth],
+                                      depth >= CV_32F ? "" : "_sat_rte");

    size_t localThreads[3]  = { 256, 1, 1 };
-    size_t globalThreads[3] = { cols, dst.rows, 1};
+    size_t globalThreads[3] = { cols1, dst.rows, 1};
+
+    float alpha_f = static_cast<float>(alpha),
+            beta_f = static_cast<float>(beta),
+            gama_f = static_cast<float>(gama);

-    int dst_step1 = dst.cols * dst.elemSize();
-    int src1_step = (int) src1.step;
-    int src2_step = (int) src2.step;
-    int dst_step  = (int) dst.step;
-    float alpha_f = alpha, beta_f = beta, gama_f = gama;
    vector<pair<size_t , const void *> > args;
    args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src1_step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src1step1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src1offset1));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src2_step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src2step1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src2offset1));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dstoffset1));

-    if(src1.clCxt->supportsFeature(Context::CL_DOUBLE))
-    {
-        args.push_back( make_pair( sizeof(cl_double), (void *)&alpha ));
-        args.push_back( make_pair( sizeof(cl_double), (void *)&beta ));
-        args.push_back( make_pair( sizeof(cl_double), (void *)&gama ));
-    }
-    else
+    if (!hasDouble)
    {
        args.push_back( make_pair( sizeof(cl_float), (void *)&alpha_f ));
        args.push_back( make_pair( sizeof(cl_float), (void *)&beta_f ));
        args.push_back( make_pair( sizeof(cl_float), (void *)&gama_f ));
    }
+    else
+    {
+        args.push_back( make_pair( sizeof(cl_double), (void *)&alpha ));
+        args.push_back( make_pair( sizeof(cl_double), (void *)&beta ));
+        args.push_back( make_pair( sizeof(cl_double), (void *)&gama ));
+    }

-    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&cols1 ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));

-    openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads,
+                        args, -1, -1, buildOptions.c_str());
 }

 static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernelName, const char **kernelString)
--- a/modules/ocl/src/opencl/arithm_addWeighted.cl
+++ b/modules/ocl/src/opencl/arithm_addWeighted.cl
@@ -42,392 +42,34 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #if defined (DOUBLE_SUPPORT)
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #elif defined (cl_amd_fp64)
 #pragma OPENCL EXTENSION cl_amd_fp64:enable
 #endif
-typedef double F;
-#else
-typedef float F;
 #endif
+
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////addWeighted//////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
-__kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset,
-                              __global uchar *src2, int src2_step,int src2_offset,
-                              F alpha,F beta,F gama,
-                              __global uchar *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
+
+__kernel void addWeighted(__global T * src1, int src1_step1, int src1_offset1,
+                              __global T * src2, int src2_step1, int src2_offset1,
+                              __global T * dst, int dst_step1, int dst_offset1,
+                              WT alpha, WT beta, WT gama,
+                              int cols1, int rows)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if (x < cols && y < rows)
-
+    if (x < cols1 && y < rows)
    {
+        int src1_index = mad24(y, src1_step1, x + src1_offset1);
+        int src2_index = mad24(y, src2_step1, x + src2_offset1);
+        int dst_index = mad24(y, dst_step1, x + dst_offset1);

-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data ,src2_data;
-
-        src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
-        src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
-        src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
-        src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
-
-        src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
-        src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
-        src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
-        src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-//        short4 tmp      = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
-        short4 tmp;
-        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-        // dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
+        dst[dst_index] = convertToT(src1[src1_index]*alpha + src2[src2_index]*beta + gama);
    }
-
 }
-
-
-
-__kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset,
-                              __global ushort *src2, int src2_step,int src2_offset,
-                              F alpha,F beta,F gama,
-                              __global ushort *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-    {
-
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
-        if(src1_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-        // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
-        int4 tmp;
-        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
-        ushort4 tmp_data = convert_ushort4_sat(tmp);
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-
-}
-
-
-__kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offset,
-                              __global short *src2,  int src2_step,int src2_offset,
-                              F alpha,F beta,F gama,
-                              __global short *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-    {
-
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 ));
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
-
-        if(src1_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-        // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
-        int4 tmp;
-        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
-        short4 tmp_data = convert_short4_sat(tmp);
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-}
-
-
-__kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
-                              __global int *src2, int src2_step,int src2_offset,
-                              F alpha,F beta, F gama,
-                              __global int *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-    {
-
-        x = x << 2;
-
-#define bitOfInt  (sizeof(int)== 4 ? 2: 3)
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> bitOfInt) & 3)
-
-        int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
-        int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
-        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
-
-        if(src1_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
-        // double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
-        float4 tmp;
-        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
-        int4 tmp_data = convert_int4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-}
-
-
-__kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset,
-                              __global float *src2, int src2_step,int src2_offset,
-                              F alpha,F beta, F gama,
-                              __global float *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-    {
-
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2) & 3)
-
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        float4 src1_data = vload4(0, (__global float  *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
-        float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
-        if(src1_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        //    double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
-
-        // float4   tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
-        float4 tmp_data;
-        tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
-        // float4 tmp_data = convert_float4(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global float4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset,
-                              __global double *src2, int src2_step,int src2_offset,
-                              F alpha,F beta, F gama,
-                              __global double *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-    {
-
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 3) & 3)
-
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        double4 src1_data = vload4(0, (__global double  *)((__global char *)src1 + src1_index_fix));
-        double4 src2_data = vload4(0, (__global double  *)((__global char *)src2 + src2_index_fix));
-        double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
-        if(src1_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        //  double4   tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
-        double4 tmp_data;
-        tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 16 >= dst_start) && (dst_index + 16 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 24 >= dst_start) && (dst_index + 24 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global double4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-}
-#endif