remove redundant OPENCL_DIR flag

remove as much warnings as possible use enum instead of MACRO for ocl.hpp add command line parser in accuracy test and perf test some bug fix for arthim functions
2012-10-22 15:14:22 +08:00
parent b6a2717c2b
commit 5df77a841e
44 changed files with 2040 additions and 1593 deletions
--- a/modules/ocl/src/kernels/arithm_addWeighted.cl
+++ b/modules/ocl/src/kernels/arithm_addWeighted.cl
@@ -61,30 +61,29 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
    int y = get_global_id(1);

    if (x < cols && y < rows)
-
+    
    {

        x = x << 2;
-
        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

-        uchar4 src1_data ,src2_data;
+		uchar4 src1_data ,src2_data;

-        src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
-        src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
-        src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
-        src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
+		src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
+		src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
+		src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
+		src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;

-        src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
-        src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
-        src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
-        src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
+		src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
+		src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
+		src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
+		src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
 //        short4 tmp      = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
@@ -118,21 +117,35 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
    int y = get_global_id(1);

    if (x < cols && y < rows)
-
+    
    {

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8);
+    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
+        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
+    if(src1_index < 0)
+    {
+        ushort4 tmp;
+        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+    }
+    if(src2_index < 0)
+    {
+        ushort4 tmp;
+        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+    }

-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));

        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
       // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
@@ -164,22 +177,36 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
    int y = get_global_id(1);

    if (x < cols && y < rows)
-
+    
    {

        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 ));

-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
+        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));

+    if(src1_index < 0)
+    {
+        short4 tmp;
+        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+    }
+    if(src2_index < 0)
+    {
+        short4 tmp;
+        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+    }
        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
       // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
         int4 tmp;
@@ -209,24 +236,39 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
    int y = get_global_id(1);

    if (x < cols && y < rows)
-
+    
    {
-
+            
        x = x << 2;

        #define bitOfInt  (sizeof(int)== 4 ? 2: 3)

        #define dst_align ((dst_offset >> bitOfInt) & 3)

-        int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
-        int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
-
+        int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt)); 
+        int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt)); 
+       
        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));

-        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
-        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
+    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
+        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
+        
+    if(src1_index < 0)
+    {
+        int4 tmp;
+        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+    }
+    if(src2_index < 0)
+    {
+        int4 tmp;
+        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+    }
        int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
       // double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
         float4 tmp;
@@ -257,23 +299,37 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
    int y = get_global_id(1);

    if (x < cols && y < rows)
-
+    
    {
-
+            
        x = x << 2;

        #define dst_align ((dst_offset >> 2) & 3)

-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
+       
        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));

-        float4 src1_data = vload4(0, (__global float  *)((__global char *)src1 + src1_index));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
+    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        float4 src1_data = vload4(0, (__global float  *)((__global char *)src1 + src1_index_fix));
+        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
        float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
+    if(src1_index < 0)
+    {
+        float4 tmp;
+        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+    }
+    if(src2_index < 0)
+    {
+        float4 tmp;
+        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+    }
    //    double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;

       // float4   tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
@@ -305,23 +361,37 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
    int y = get_global_id(1);

    if (x < cols && y < rows)
-
+    
    {
-
+            
        x = x << 2;

        #define dst_align ((dst_offset >> 3) & 3)

-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
-
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
+        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); 
+       
        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));

-        double4 src1_data = vload4(0, (__global double  *)((__global char *)src1 + src1_index));
-        double4 src2_data = vload4(0, (__global double  *)((__global char *)src2 + src2_index));
+    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        double4 src1_data = vload4(0, (__global double  *)((__global char *)src1 + src1_index_fix));
+        double4 src2_data = vload4(0, (__global double  *)((__global char *)src2 + src2_index_fix));
        double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
+    if(src1_index < 0)
+    {
+        double4 tmp;
+        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+    }
+    if(src2_index < 0)
+    {
+        double4 tmp;
+        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+    }
      //  double4   tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
         double4 tmp_data;
        tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
--- a/modules/ocl/src/kernels/arithm_bitwise_and.cl
+++ b/modules/ocl/src/kernels/arithm_bitwise_and.cl
@@ -63,15 +63,29 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
        x = x << 2;

        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+     uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+     uchar4 src2_data = vload4(0, src2 + src2_index_fix);

-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
+     if(src1_index < 0)
+     {     
+        uchar4 tmp;
+        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+     }                 
+     if(src2_index < 0)  
+     {                         
+        uchar4 tmp;                   
+        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+     }

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = src1_data & src2_data;
@@ -99,16 +113,30 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
        x = x << 2;

        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

-        char4 src1_data = vload4(0, src1 + src1_index);
-        char4 src2_data = vload4(0, src2 + src2_index);
+     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+     char4 src1_data = vload4(0, src1 + src1_index_fix);
+     char4 src2_data = vload4(0, src2 + src2_index_fix);

+     if(src1_index < 0)
+     {     
+        char4 tmp;
+        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+     }                 
+     if(src2_index < 0)  
+     {                         
+        char4 tmp;                   
+        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+     }
        char4 dst_data = *((__global char4 *)(dst + dst_index));
        char4 tmp_data = src1_data & src2_data;

@@ -136,16 +164,30 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
+        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));

+     if(src1_index < 0)
+     {     
+        ushort4 tmp;
+        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+     }                 
+     if(src2_index < 0)  
+     {                         
+        ushort4 tmp;                   
+        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+     }
        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
        ushort4 tmp_data = src1_data & src2_data;

@@ -174,16 +216,30 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
+        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));

+     if(src1_index < 0)
+     {     
+        short4 tmp;
+        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+     }                 
+     if(src2_index < 0)  
+     {                         
+        short4 tmp;                   
+        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+     }
        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
        short4 tmp_data = src1_data & src2_data;

--- a/modules/ocl/src/kernels/arithm_bitwise_not.cl
+++ b/modules/ocl/src/kernels/arithm_bitwise_not.cl
@@ -62,17 +62,24 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
        x = x << 2;

        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
+    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = ~ src1_data;
-
+        
+  /*  if(src1_index < 0)
+    {
+      uchar4 tmp;
+      tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+      src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+    }
+  */
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
@@ -95,7 +102,7 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
        x = x << 2;

        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -129,7 +136,7 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -164,7 +171,7 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -238,12 +245,12 @@ __kernel void arithm_bitwise_not_D6 (__global char *src, int src_step, int src_o
    {
        int src_index = mad24(y, src_step, (x << 3) + src_offset);
        int dst_index = mad24(y, dst_step,  (x << 3) + dst_offset);
-
+         
        char8 data;

        data = *((__global char8 *)((__global char *)src + src_index));
        data = ~ data;
-
+        
        *((__global char8 *)((__global char *)dst + dst_index)) = data;
    }
 }
--- a/modules/ocl/src/kernels/arithm_bitwise_or.cl
+++ b/modules/ocl/src/kernels/arithm_bitwise_or.cl
@@ -63,16 +63,28 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
        x = x << 2;

        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
-
+      int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+      int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+      if(src1_index < 0)
+      {
+        uchar4 tmp;
+        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+      }
+      if(src2_index < 0)
+      {
+        uchar4 tmp;
+        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+      }
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = src1_data | src2_data;

@@ -99,8 +111,8 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
        x = x << 2;

        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -136,8 +148,8 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -174,8 +186,8 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
--- a/modules/ocl/src/kernels/arithm_bitwise_xor.cl
+++ b/modules/ocl/src/kernels/arithm_bitwise_xor.cl
@@ -63,16 +63,30 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
        x = x << 2;

        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
+     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+        uchar4 src2_data = vload4(0, src2 + src2_index_fix);

+     if(src1_index < 0)
+     {     
+        uchar4 tmp;
+        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+     }                 
+     if(src2_index < 0)  
+     {                         
+        uchar4 tmp;                   
+        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+     }
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = src1_data ^ src2_data;

@@ -99,16 +113,30 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
        x = x << 2;

        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

-        char4 src1_data = vload4(0, src1 + src1_index);
-        char4 src2_data = vload4(0, src2 + src2_index);
+     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        char4 src1_data = vload4(0, src1 + src1_index_fix);
+        char4 src2_data = vload4(0, src2 + src2_index_fix);

+     if(src1_index < 0)
+     {     
+        char4 tmp;
+        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+     }                 
+     if(src2_index < 0)  
+     {                         
+        char4 tmp;                   
+        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+     }
        char4 dst_data = *((__global char4 *)(dst + dst_index));
        char4 tmp_data = src1_data ^ src2_data;

@@ -136,16 +164,30 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
+        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));

+     if(src1_index < 0)
+     {     
+        ushort4 tmp;
+        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+     }                 
+     if(src2_index < 0)  
+     {                         
+        ushort4 tmp;                   
+        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+     }
        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
        ushort4 tmp_data = src1_data ^ src2_data;

@@ -174,17 +216,35 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
+        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));

        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
+
+     if(src1_index < 0)
+     {     
+        short4 tmp;
+        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+     }                 
+     if(src2_index < 0)  
+     {                         
+        short4 tmp;                   
+        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+     }
+
+
+
        short4 tmp_data = src1_data ^ src2_data;

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
--- a/modules/ocl/src/kernels/arithm_compare_eq.cl
+++ b/modules/ocl/src/kernels/arithm_compare_eq.cl
@@ -63,16 +63,31 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
        x = x << 2;

        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+		if(src1_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 

-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
-
+  
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data == src2_data));

@@ -85,7 +100,8 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
    }
 }

-__kernel void arithm_compare_eq_D2 (__global ushort *src1, int src1_step, int src1_offset,
+
+__kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset,
                             __global ushort *src2, int src2_step, int src2_offset,
                             __global uchar *dst,  int dst_step,  int dst_offset,
                             int rows, int cols, int dst_step1)
@@ -98,16 +114,30 @@ __kernel void arithm_compare_eq_D2 (__global ushort *src1, int src1_step, int sr
    {
        x = x << 2;

-        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        #define dst_align ((dst_offset >> 1)& 3)
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

+ 		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));		
+		if(src1_index < 0)
+		{
+			ushort4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			ushort4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@@ -122,7 +152,6 @@ __kernel void arithm_compare_eq_D2 (__global ushort *src1, int src1_step, int sr
 }


-
 __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src1_offset,
                             __global short *src2, int src2_step, int src2_offset,
                             __global uchar *dst,  int dst_step,  int dst_offset,
@@ -137,16 +166,32 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));		
+		if(src1_index < 0)
+		{
+			short4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			short4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 

+
+  
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data == src2_data));

@@ -170,18 +215,33 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
    int y = get_global_id(1);

    if (x < cols && y < rows)
-    {
+    {   
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;

-        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
+         int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
+		if(src1_index < 0)
+		{
+			int4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			int4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+
        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data == src2_data));

@@ -206,15 +266,23 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
+        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));		if(src2_index < 0)
+		{
+			float4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 

-        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data == src2_data));

@@ -240,15 +308,30 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr
    {
        x = x << 2;
        #define dst_align ((dst_offset >> 3) & 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
+        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
+        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+		if(src1_index < 0)
+		{
+			double4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			double4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 

-        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data == src2_data));

@@ -276,16 +359,31 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src
        x = x << 2;

        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+		if(src1_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 

-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
-
+ 
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data > src2_data));

@@ -312,16 +410,31 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));		
+		if(src1_index < 0)
+		{
+			ushort4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			ushort4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+  

+ 
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data > src2_data));

@@ -350,15 +463,30 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));		
+		if(src1_index < 0)
+		{
+			short4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			short4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 
+

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
@@ -384,15 +512,31 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;

-        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
+         int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
+		if(src1_index < 0)
+		{
+			int4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			int4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+
+ 
        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data > src2_data));

@@ -417,15 +561,30 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src
    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
+        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
+		if(src1_index < 0)
+		{
+			float4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			float4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 

-        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data > src2_data));

@@ -451,15 +610,30 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr
    {
        x = x << 2;
        #define dst_align ((dst_offset >> 3) & 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
+        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
+        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));	
+		if(src1_index < 0)
+		{
+			double4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			double4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 

-        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data > src2_data));

@@ -487,15 +661,31 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src
        x = x << 2;

        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+		if(src1_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 
+

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
@@ -525,15 +715,32 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

+ 		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));		
+		if(src1_index < 0)
+		{
+			ushort4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			ushort4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+  
+
+

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
@@ -563,15 +770,31 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src
        x = x << 2;

        #define dst_align ((dst_offset >> 1)& 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));		
+		if(src1_index < 0)
+		{
+			short4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			short4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 
+

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
@@ -598,16 +821,31 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
        x = x << 2;

        #define dst_align ((dst_offset >> 2)& 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+
        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
+		if(src1_index < 0)
+		{
+			int4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			int4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+       uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@@ -632,15 +870,31 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src
        x = x << 2;

        #define dst_align ((dst_offset >> 2)& 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

-        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
+  		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
+        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
+		if(src1_index < 0)
+		{
+
+			float4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			float4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+
        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));

@@ -667,16 +921,28 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
        x = x << 2;

        #define dst_align ((dst_offset >> 3)& 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
+        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
+        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));	
+		if(src1_index < 0)
+		{
+			double4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			double4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
--- a/modules/ocl/src/kernels/arithm_compare_ne.cl
+++ b/modules/ocl/src/kernels/arithm_compare_ne.cl
@@ -59,15 +59,29 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src
        x = x << 2;

        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+		if(src1_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@@ -97,15 +111,29 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
        x = x << 2;

        #define dst_align ((dst_offset >> 1)& 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

+ 		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));		
+		if(src1_index < 0)
+		{
+			ushort4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			ushort4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@@ -135,15 +163,29 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src
        x = x << 2;

        #define dst_align ((dst_offset >> 1)& 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));		
+		if(src1_index < 0)
+		{
+			short4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			short4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@@ -169,15 +211,31 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_
    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2)& 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+	
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;

        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
+		if(src1_index < 0)
+		{
+			int4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			int4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+
        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data != src2_data));

@@ -202,16 +260,29 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
+        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));		if(src1_index < 0)
+		{
+			float4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			float4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 
+       uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data != src2_data));

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@@ -236,15 +307,30 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
    {
        x = x << 2;
        #define dst_align ((dst_offset >> 3) & 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
+        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
+        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));	
+		if(src1_index < 0)
+		{
+			double4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			double4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 

-        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data != src2_data));

@@ -258,7 +344,7 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
 }
 #endif

-
+   
 /***********************************Compare LT*******************************/
 __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
                             __global uchar *src2, int src2_step, int src2_offset,
@@ -273,15 +359,29 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src
        x = x << 2;

        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+		if(src1_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@@ -311,15 +411,30 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));		
+		if(src1_index < 0)
+		{
+			ushort4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			ushort4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+  

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@@ -349,15 +464,30 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));		
+		if(src1_index < 0)
+		{
+			short4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			short4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 
+

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@@ -383,15 +513,34 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_
    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+
        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
+		if(src1_index < 0)
+		{
+			int4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			int4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+
+
+ 
+   
        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data < src2_data));

@@ -416,16 +565,31 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
+        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
+		if(src1_index < 0)
+		{
+			float4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			float4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 

-        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
+       uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data < src2_data));

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@@ -450,16 +614,31 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
    {
        x = x << 2;
        #define dst_align ((dst_offset >> 3) & 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
+        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
+        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));	
+		if(src1_index < 0)
+		{
+			double4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			double4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 

-        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
+       uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data < src2_data));

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@@ -486,15 +665,30 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src
        x = x << 2;

        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+		if(src1_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			uchar4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 

-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
@@ -524,15 +718,30 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));		
+		if(src1_index < 0)
+		{
+			ushort4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			ushort4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+  
+

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
@@ -562,15 +771,30 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src
        x = x << 2;

        #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));		
+		if(src1_index < 0)
+		{
+			short4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			short4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 
+

        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
@@ -596,15 +820,30 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_
    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2)& 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;

        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
+		if(src1_index < 0)
+		{
+			int4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			int4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+
        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data =convert_uchar4((src1_data <= src2_data));

@@ -629,15 +868,29 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src
    {
        x = x << 2;
        #define dst_align ((dst_offset >> 2)& 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
+        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));		
+		if(src1_index < 0)
+		{
+			float4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			float4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		

-        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));

@@ -663,15 +916,30 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
    {
        x = x << 2;
        #define dst_align ((dst_offset >> 3)& 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
+        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); 

        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+		int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+		int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
+        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));	
+		if(src1_index < 0)
+		{
+			double4 tmp;
+			tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+			src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+		}
+		if(src2_index < 0)
+		{
+			double4 tmp;
+			tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+			src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+		}		
+ 

-        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
        uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));

--- a/modules/ocl/src/kernels/arithm_magnitudeSqr.cl
+++ b/modules/ocl/src/kernels/arithm_magnitudeSqr.cl
@@ -60,23 +60,36 @@ __kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_of
    int y = get_global_id(1);

    if (x < cols && y < rows)
-
+    

    {
-
+            
        x = x << 2;

        #define dst_align ((dst_offset >> 2) & 3)

-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
+       
        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
-
-        float4 src1_data = vload4(0, (__global float  *)((__global char *)src1 + src1_index));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
+    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        float4 src1_data = vload4(0, (__global float  *)((__global char *)src1 + src1_index_fix));
+        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
+    if(src1_index < 0)
+    {
+        float4 tmp;
+        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+    }
+    if(src2_index < 0)
+    {
+        float4 tmp;
+        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+    }
        float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));

        float4   tmp_data  ;
@@ -112,21 +125,32 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of
    int y = get_global_id(1);

    if (x < cols && y < rows)
-
+    

    {
-
+            
        x = x << 2;

        #define dst_align ((dst_offset >> 2) & 3)

-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
+       
        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
+    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+
+        float8 src1_data = vload8(0, (__global float  *)((__global char *)src1 + src1_index_fix));
+
+    if(src1_index==-6)
+          src1_data.s01234567 = src1_data.s67012345;
+    if(src1_index==-4)
+          src1_data.s01234567 = src1_data.s45670123;
+    if(src1_index== -2)
+          src1_data.s01234567 = src1_data.s23456701;
+        
+    

-        float8 src1_data = vload8(0, (__global float  *)((__global char *)src1 + src1_index));
        float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));

        float4   tmp_data  ;
--- a/modules/ocl/src/kernels/imgproc_remap.cl
+++ b/modules/ocl/src/kernels/imgproc_remap.cl
--- a/modules/ocl/src/kernels/split_mat.cl
+++ b/modules/ocl/src/kernels/split_mat.cl
@@ -51,9 +51,9 @@
 ////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)//////
 ////////////////////////////////////////////////////////////////////////////////////////////////
 __kernel void split_vector_C4_D0 (__global uchar *mat_src,  int src_step,  int src_offset,
-                                  __global uchar *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global uchar *mat_dst1, int dst1_step, int dst1_offset,
-                                    __global uchar *mat_dst2, int dst2_step, int dst2_offset,
+                                  __global uchar *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global uchar *mat_dst1, int dst1_step, int dst1_offset, 
+	                                __global uchar *mat_dst2, int dst2_step, int dst2_offset,  
                                  __global uchar *mat_dst3, int dst3_step, int dst3_offset,
                                  int rows, int cols, int dst_step1)

@@ -61,37 +61,37 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src,  int src_step,  int s
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
        x = x << 2;

-        int src_idx  = mad24(y, src_step, src_offset + (x << 2));
+        int src_idx  = mad24(y, src_step, src_offset + (x << 2)); 

-        int dst0_start = mad24(y, dst0_step, dst0_offset);
+        int dst0_start = mad24(y, dst0_step, dst0_offset); 
        int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx   = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc;

-        int dst1_start = mad24(y, dst1_step, dst1_offset);
+        int dst1_start = mad24(y, dst1_step, dst1_offset); 
        int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx   = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc;

-        int dst2_start = mad24(y, dst2_step, dst2_offset);
+        int dst2_start = mad24(y, dst2_step, dst2_offset); 
        int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx   = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc;

-        int dst3_start = mad24(y, dst3_step, dst3_offset);
+        int dst3_start = mad24(y, dst3_step, dst3_offset); 
        int dst3_end   = mad24(y, dst3_step, dst3_offset + dst_step1);
        int dst3_idx   = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc;
+           
+        uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx))); 
+        uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8  >= 0 ? src_idx - 8  : src_idx))); 
+        uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4  >= 0 ? src_idx - 4  : src_idx))); 
+        uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 )); 

-        uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
-        uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8  >= 0 ? src_idx - 8  : src_idx)));
-        uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4  >= 0 ? src_idx - 4  : src_idx)));
-        uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
-
-        int total_bytes = src_offset + rows * src_step;
-        uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4  < total_bytes ? src_idx + 4  : src_idx)));
-        uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8  < total_bytes ? src_idx + 8  : src_idx)));
-        uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
+        int total_bytes = src_offset + rows * src_step; 
+        uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4  < total_bytes ? src_idx + 4  : src_idx))); 
+        uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8  < total_bytes ? src_idx + 8  : src_idx))); 
+        uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));  

        uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;

@@ -164,33 +164,33 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src,  int src_step,  int s
 }

 __kernel void split_vector_C3_D0 (__global uchar *mat_src,  int src_step,  int src_offset,
-                                  __global uchar *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global uchar *mat_dst1, int dst1_step, int dst1_offset,
-                                    __global uchar *mat_dst2, int dst2_step, int dst2_offset,
+                                  __global uchar *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global uchar *mat_dst1, int dst1_step, int dst1_offset, 
+	                                __global uchar *mat_dst2, int dst2_step, int dst2_offset,  
                                  int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
        x = x << 2;

-        int src_idx  = mad24(y, src_step, src_offset);
+        int src_idx  = mad24(y, src_step, src_offset); 

-        int dst0_start = mad24(y, dst0_step, dst0_offset);
+        int dst0_start = mad24(y, dst0_step, dst0_offset); 
        int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx   = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);

-        int dst1_start = mad24(y, dst1_step, dst1_offset);
+        int dst1_start = mad24(y, dst1_step, dst1_offset); 
        int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx   = mad24(y, dst1_step, dst1_offset + x  & (int)0xfffffffc);

-        int dst2_start = mad24(y, dst2_step, dst2_offset);
+        int dst2_start = mad24(y, dst2_step, dst2_offset); 
        int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx   = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
-
+           
        uchar4 dst0_data  = *((__global uchar4 *)(mat_dst0 + dst0_idx));
        uchar4 dst1_data  = *((__global uchar4 *)(mat_dst1 + dst1_idx));
        uchar4 dst2_data  = *((__global uchar4 *)(mat_dst2 + dst2_idx));
@@ -227,10 +227,10 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src,  int src_step,  int s

        uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
        int index = 3 - dst0_offset & 3;
-        tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
+        tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]); 

        uchar4 data0, data1, data2;
-
+        
        data0     = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10);
        data1     = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13)  : data0;
        data2     = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
@@ -263,33 +263,47 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src,  int src_step,  int s
 }

 __kernel void split_vector_C2_D0 (__global uchar *mat_src,  int src_step,  int src_offset,
-                                  __global uchar *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global uchar *mat_dst1, int dst1_step, int dst1_offset,
+                                  __global uchar *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global uchar *mat_dst1, int dst1_step, int dst1_offset, 
                                  int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
        x = x << 2;

        #define dst0_align ((dst0_offset & 3) << 1)
        #define dst1_align ((dst1_offset & 3) << 1)
-        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 1));
-        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 1));
+        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 1)); 
+        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 1)); 

-        int dst0_start = mad24(y, dst0_step, dst0_offset);
+        int dst0_start = mad24(y, dst0_step, dst0_offset); 
        int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx   = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);

-        int dst1_start = mad24(y, dst1_step, dst1_offset);
+        int dst1_start = mad24(y, dst1_step, dst1_offset); 
        int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx   = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
-
+           
+		int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+		int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
        uchar8 src_data_0 = vload8(0, mat_src + src_idx_0);
        uchar8 src_data_1 = vload8(0, mat_src + src_idx_1);
+        if(src_idx_0 == -6)
+            src_data_0.s01234567 = src_data_0.s67012345;
+        if(src_idx_0 == -4)
+            src_data_0.s01234567 = src_data_0.s45670123;
+        if(src_idx_0 == -2)
+            src_data_0.s01234567 = src_data_0.s23456701;
+        if(src_idx_1 == -6)
+            src_data_1.s01234567 = src_data_1.s67012345;
+        if(src_idx_1 == -4)
+            src_data_1.s01234567 = src_data_1.s45670123;
+        if(src_idx_1 == -2)
+            src_data_1.s01234567 = src_data_1.s23456701;

        uchar4 dst0_data  = *((__global uchar4 *)(mat_dst0 + dst0_idx));
        uchar4 dst1_data  = *((__global uchar4 *)(mat_dst1 + dst1_idx));
@@ -312,9 +326,9 @@ __kernel void split_vector_C2_D0 (__global uchar *mat_src,  int src_step,  int s
 }

 __kernel void split_vector_C4_D1 (__global char *mat_src,  int src_step,  int src_offset,
-                                  __global char *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global char *mat_dst1, int dst1_step, int dst1_offset,
-                                    __global char *mat_dst2, int dst2_step, int dst2_offset,
+                                  __global char *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global char *mat_dst1, int dst1_step, int dst1_offset, 
+	                                __global char *mat_dst2, int dst2_step, int dst2_offset,  
                                  __global char *mat_dst3, int dst3_step, int dst3_offset,
                                  int rows, int cols, int dst_step1)

@@ -322,35 +336,35 @@ __kernel void split_vector_C4_D1 (__global char *mat_src,  int src_step,  int sr
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
        x = x << 2;

-        int src_idx  = mad24(y, src_step, src_offset + (x << 2));
+        int src_idx  = mad24(y, src_step, src_offset + (x << 2)); 

-        int dst0_start = mad24(y, dst0_step, dst0_offset);
+        int dst0_start = mad24(y, dst0_step, dst0_offset); 
        int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx   = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);

-        int dst1_start = mad24(y, dst1_step, dst1_offset);
+        int dst1_start = mad24(y, dst1_step, dst1_offset); 
        int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx   = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);

-        int dst2_start = mad24(y, dst2_step, dst2_offset);
+        int dst2_start = mad24(y, dst2_step, dst2_offset); 
        int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx   = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);

-        int dst3_start = mad24(y, dst3_step, dst3_offset);
+        int dst3_start = mad24(y, dst3_step, dst3_offset); 
        int dst3_end   = mad24(y, dst3_step, dst3_offset + dst_step1);
        int dst3_idx   = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc);
-
-        char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
-        char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
-        char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
-        char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
-        char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
-        char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
-        char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
+           
+        char4 data_0 = *((global char4 *)(mat_src + src_idx - 12)); 
+        char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 )); 
+        char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 )); 
+        char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 )); 
+        char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 )); 
+        char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 )); 
+        char4 data_6 = *((global char4 *)(mat_src + src_idx + 12)); 

        char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;

@@ -423,33 +437,33 @@ __kernel void split_vector_C4_D1 (__global char *mat_src,  int src_step,  int sr
 }

 __kernel void split_vector_C3_D1 (__global char *mat_src,  int src_step,  int src_offset,
-                                  __global char *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global char *mat_dst1, int dst1_step, int dst1_offset,
-                                    __global char *mat_dst2, int dst2_step, int dst2_offset,
+                                  __global char *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global char *mat_dst1, int dst1_step, int dst1_offset, 
+	                                __global char *mat_dst2, int dst2_step, int dst2_offset,  
                                  int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
        x = x << 2;

-        int src_idx  = mad24(y, src_step, src_offset);
+        int src_idx  = mad24(y, src_step, src_offset); 

-        int dst0_start = mad24(y, dst0_step, dst0_offset);
+        int dst0_start = mad24(y, dst0_step, dst0_offset); 
        int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx   = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);

-        int dst1_start = mad24(y, dst1_step, dst1_offset);
+        int dst1_start = mad24(y, dst1_step, dst1_offset); 
        int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx   = mad24(y, dst1_step, dst1_offset + x  & (int)0xfffffffc);

-        int dst2_start = mad24(y, dst2_step, dst2_offset);
+        int dst2_start = mad24(y, dst2_step, dst2_offset); 
        int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx   = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
-
+           
        char4 dst0_data  = *((__global char4 *)(mat_dst0 + dst0_idx));
        char4 dst1_data  = *((__global char4 *)(mat_dst1 + dst1_idx));
        char4 dst2_data  = *((__global char4 *)(mat_dst2 + dst2_idx));
@@ -486,10 +500,10 @@ __kernel void split_vector_C3_D1 (__global char *mat_src,  int src_step,  int sr

        char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
        int index = 3 - dst0_offset & 3;
-        tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
+        tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]); 

        char4 data0, data1, data2;
-
+        
        data0     = (char4)(src_data_1, src_data_4, src_data_7, src_data_10);
        data1     = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13)  : data0;
        data2     = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
@@ -522,34 +536,46 @@ __kernel void split_vector_C3_D1 (__global char *mat_src,  int src_step,  int sr
 }

 __kernel void split_vector_C2_D1 (__global char *mat_src,  int src_step,  int src_offset,
-                                  __global char *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global char *mat_dst1, int dst1_step, int dst1_offset,
+                                  __global char *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global char *mat_dst1, int dst1_step, int dst1_offset, 
                                  int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
        x = x << 2;

        #define dst0_align ((dst0_offset & 3) << 1)
        #define dst1_align ((dst1_offset & 3) << 1)
-        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 1));
-        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 1));
+        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 1)); 
+        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 1)); 

-        int dst0_start = mad24(y, dst0_step, dst0_offset);
+        int dst0_start = mad24(y, dst0_step, dst0_offset); 
        int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx   = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);

-        int dst1_start = mad24(y, dst1_step, dst1_offset);
+        int dst1_start = mad24(y, dst1_step, dst1_offset); 
        int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx   = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
-
+   	int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+		int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
        char8 src_data_0 = vload8(0, mat_src + src_idx_0);
        char8 src_data_1 = vload8(0, mat_src + src_idx_1);
-
+        if(src_idx_0 == -6)
+            src_data_0.s01234567 = src_data_0.s67012345;
+        if(src_idx_0 == -4)
+            src_data_0.s01234567 = src_data_0.s45670123;
+        if(src_idx_0 == -2)
+            src_data_0.s01234567 = src_data_0.s23456701;
+        if(src_idx_1 == -6)
+            src_data_1.s01234567 = src_data_1.s67012345;
+        if(src_idx_1 == -4)
+            src_data_1.s01234567 = src_data_1.s45670123;
+        if(src_idx_1 == -2)
+            src_data_1.s01234567 = src_data_1.s23456701;
        char4 dst0_data  = *((__global char4 *)(mat_dst0 + dst0_idx));
        char4 dst1_data  = *((__global char4 *)(mat_dst1 + dst1_idx));

@@ -571,9 +597,9 @@ __kernel void split_vector_C2_D1 (__global char *mat_src,  int src_step,  int sr
 }

 __kernel void split_vector_C4_D2 (__global ushort *mat_src,  int src_step,  int src_offset,
-                                  __global ushort *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global ushort *mat_dst1, int dst1_step, int dst1_offset,
-                                    __global ushort *mat_dst2, int dst2_step, int dst2_offset,
+                                  __global ushort *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global ushort *mat_dst1, int dst1_step, int dst1_offset, 
+	                                __global ushort *mat_dst2, int dst2_step, int dst2_offset,  
                                  __global ushort *mat_dst3, int dst3_step, int dst3_offset,
                                  int rows, int cols, int dst_step1)

@@ -581,30 +607,37 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src,  int src_step,  int
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
        x = x << 1;

-        int src_idx_0  = mad24(y, src_step, src_offset + (x << 3) - 8);
-        int src_idx_1  = mad24(y, src_step, src_offset + (x << 3) + 8);
+        int src_idx_0  = mad24(y, src_step, src_offset + (x << 3) - 8); 
+        int src_idx_1  = mad24(y, src_step, src_offset + (x << 3) + 8); 

-        int dst0_start = mad24(y, dst0_step, dst0_offset);
+        int dst0_start = mad24(y, dst0_step, dst0_offset); 
        int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx   = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);

-        int dst1_start = mad24(y, dst1_step, dst1_offset);
+        int dst1_start = mad24(y, dst1_step, dst1_offset); 
        int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx   = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);

-        int dst2_start = mad24(y, dst2_step, dst2_offset);
+        int dst2_start = mad24(y, dst2_step, dst2_offset); 
        int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx   = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);

-        int dst3_start = mad24(y, dst3_step, dst3_offset);
+        int dst3_start = mad24(y, dst3_step, dst3_offset); 
        int dst3_end   = mad24(y, dst3_step, dst3_offset + dst_step1);
        int dst3_idx   = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort8 src_data0 = vload8(0, (__global ushort *)((__global char *)mat_src + src_idx_0));
+           
+   	int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+        ushort8 src_data0 = vload8(0,(__global ushort *)((__global char *)mat_src + src_idx_0));
+             if(src_idx_0 == -6)
+            src_data0.s01234567 = src_data0.s67012345;
+        if(src_idx_0 == -4)
+            src_data0.s01234567 = src_data0.s45670123;
+        if(src_idx_0 == -2)
+            src_data0.s01234567 = src_data0.s23456701;
        ushort4 src_data1 = *((__global ushort4 *)((__global char *)mat_src + src_idx_1));

        ushort2 dst0_data  = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
@@ -639,33 +672,33 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src,  int src_step,  int
 }

 __kernel void split_vector_C3_D2 (__global ushort *mat_src,  int src_step,  int src_offset,
-                                  __global ushort *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global ushort *mat_dst1, int dst1_step, int dst1_offset,
-                                    __global ushort *mat_dst2, int dst2_step, int dst2_offset,
+                                  __global ushort *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global ushort *mat_dst1, int dst1_step, int dst1_offset, 
+	                                __global ushort *mat_dst2, int dst2_step, int dst2_offset,  
                                  int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
        x = x << 1;

-        int src_idx  = mad24(y, src_step, src_offset);
+        int src_idx  = mad24(y, src_step, src_offset); 

-        int dst0_start = mad24(y, dst0_step, dst0_offset);
+        int dst0_start = mad24(y, dst0_step, dst0_offset); 
        int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx   = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);

-        int dst1_start = mad24(y, dst1_step, dst1_offset);
+        int dst1_start = mad24(y, dst1_step, dst1_offset); 
        int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx   = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);

-        int dst2_start = mad24(y, dst2_step, dst2_offset);
+        int dst2_start = mad24(y, dst2_step, dst2_offset); 
        int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx   = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
-
+           
        ushort2 dst0_data  = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
        ushort2 dst1_data  = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
        ushort2 dst2_data  = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
@@ -702,34 +735,48 @@ __kernel void split_vector_C3_D2 (__global ushort *mat_src,  int src_step,  int
 }

 __kernel void split_vector_C2_D2 (__global ushort *mat_src,  int src_step,  int src_offset,
-                                  __global ushort *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global ushort *mat_dst1, int dst1_step, int dst1_offset,
+                                  __global ushort *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global ushort *mat_dst1, int dst1_step, int dst1_offset, 
                                  int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
        x = x << 1;

        #define dst0_align ((dst0_offset & 3) << 1)
        #define dst1_align ((dst1_offset & 3) << 1)
-        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 2));
-        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 2));
+        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 2)); 
+        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 2)); 

-        int dst0_start = mad24(y, dst0_step, dst0_offset);
+        int dst0_start = mad24(y, dst0_step, dst0_offset); 
        int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx   = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);

-        int dst1_start = mad24(y, dst1_step, dst1_offset);
+        int dst1_start = mad24(y, dst1_step, dst1_offset); 
        int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx   = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src_idx_0));
-        ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src_idx_1));
-
+           
+		int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+		int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
+        ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src1_index_fix));
+        ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src2_index_fix));
+		if(src_idx_0 < 0)
+		{
+			ushort4 tmp;
+			tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
+			src_data_0.xyzw = (src_idx_1 == -1) ? src_data_0.wxyz:tmp.xyzw;
+		}
+		if(src_idx_1 < 0)
+		{
+			ushort4 tmp;
+			tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx;
+			src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw;
+		}		
+  
        ushort2 dst0_data  = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
        ushort2 dst1_data  = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));

@@ -746,9 +793,9 @@ __kernel void split_vector_C2_D2 (__global ushort *mat_src,  int src_step,  int
    }
 }
 __kernel void split_vector_C4_D3 (__global short *mat_src,  int src_step,  int src_offset,
-                                  __global short *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global short *mat_dst1, int dst1_step, int dst1_offset,
-                                    __global short *mat_dst2, int dst2_step, int dst2_offset,
+                                  __global short *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global short *mat_dst1, int dst1_step, int dst1_offset, 
+	                                __global short *mat_dst2, int dst2_step, int dst2_offset,  
                                  __global short *mat_dst3, int dst3_step, int dst3_offset,
                                  int rows, int cols, int dst_step1)

@@ -756,30 +803,38 @@ __kernel void split_vector_C4_D3 (__global short *mat_src,  int src_step,  int s
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
        x = x << 1;

-        int src_idx_0  = mad24(y, src_step, src_offset + (x << 3) - 8);
-        int src_idx_1  = mad24(y, src_step, src_offset + (x << 3) + 8);
+        int src_idx_0  = mad24(y, src_step, src_offset + (x << 3) - 8); 
+        int src_idx_1  = mad24(y, src_step, src_offset + (x << 3) + 8); 

-        int dst0_start = mad24(y, dst0_step, dst0_offset);
+        int dst0_start = mad24(y, dst0_step, dst0_offset); 
        int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx   = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);

-        int dst1_start = mad24(y, dst1_step, dst1_offset);
+        int dst1_start = mad24(y, dst1_step, dst1_offset); 
        int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx   = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);

-        int dst2_start = mad24(y, dst2_step, dst2_offset);
+        int dst2_start = mad24(y, dst2_step, dst2_offset); 
        int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx   = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);

-        int dst3_start = mad24(y, dst3_step, dst3_offset);
+        int dst3_start = mad24(y, dst3_step, dst3_offset); 
        int dst3_end   = mad24(y, dst3_step, dst3_offset + dst_step1);
        int dst3_idx   = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
-
-        short8 src_data0 = vload8(0, (__global short *)((__global char *)mat_src + src_idx_0));
+     	int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+        short8 src_data0 = vload8(0,(__global short *)((__global char *)mat_src + src_idx_0));
+ 
+        if(src_idx_0 == -6)
+            src_data0.s01234567 = src_data0.s67012345;
+        if(src_idx_0 == -4)
+            src_data0.s01234567 = src_data0.s45670123;
+        if(src_idx_0 == -2)
+            src_data0.s01234567 = src_data0.s23456701;
+          
        short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1));

        short2 dst0_data  = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
@@ -813,33 +868,33 @@ __kernel void split_vector_C4_D3 (__global short *mat_src,  int src_step,  int s
    }
 }
 __kernel void split_vector_C3_D3 (__global short *mat_src,  int src_step,  int src_offset,
-                                  __global short *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global short *mat_dst1, int dst1_step, int dst1_offset,
-                                    __global short *mat_dst2, int dst2_step, int dst2_offset,
+                                  __global short *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global short *mat_dst1, int dst1_step, int dst1_offset, 
+	                                __global short *mat_dst2, int dst2_step, int dst2_offset,  
                                  int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
        x = x << 1;

-        int src_idx  = mad24(y, src_step, src_offset);
+        int src_idx  = mad24(y, src_step, src_offset); 

-        int dst0_start = mad24(y, dst0_step, dst0_offset);
+        int dst0_start = mad24(y, dst0_step, dst0_offset); 
        int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx   = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);

-        int dst1_start = mad24(y, dst1_step, dst1_offset);
+        int dst1_start = mad24(y, dst1_step, dst1_offset); 
        int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx   = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);

-        int dst2_start = mad24(y, dst2_step, dst2_offset);
+        int dst2_start = mad24(y, dst2_step, dst2_offset); 
        int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
        int dst2_idx   = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
-
+           
        short2 dst0_data  = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
        short2 dst1_data  = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
        short2 dst2_data  = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
@@ -877,33 +932,47 @@ __kernel void split_vector_C3_D3 (__global short *mat_src,  int src_step,  int s


 __kernel void split_vector_C2_D3 (__global short *mat_src,  int src_step,  int src_offset,
-                                  __global short *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global short *mat_dst1, int dst1_step, int dst1_offset,
+                                  __global short *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global short *mat_dst1, int dst1_step, int dst1_offset, 
                                  int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
        x = x << 1;

        #define dst0_align ((dst0_offset & 3) << 1)
        #define dst1_align ((dst1_offset & 3) << 1)
-        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 2));
-        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 2));
+        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 2)); 
+        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 2)); 

-        int dst0_start = mad24(y, dst0_step, dst0_offset);
+        int dst0_start = mad24(y, dst0_step, dst0_offset); 
        int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
        int dst0_idx   = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);

-        int dst1_start = mad24(y, dst1_step, dst1_offset);
+        int dst1_start = mad24(y, dst1_step, dst1_offset); 
        int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
        int dst1_idx   = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-
+ 		int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+		int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
        short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0));
        short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1));
+		if(src_idx_0 < 0)
+		{
+			short4 tmp;
+			tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
+			src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
+		}
+		if(src_idx_1< 0)
+		{
+			short4 tmp;
+			tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx;
+			src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw;
+		}		
+             

        short2 dst0_data  = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
        short2 dst1_data  = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
@@ -921,9 +990,9 @@ __kernel void split_vector_C2_D3 (__global short *mat_src,  int src_step,  int s
    }
 }
 __kernel void split_vector_C4_D4 (__global int *mat_src,  int src_step,  int src_offset,
-                                  __global int *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global int *mat_dst1, int dst1_step, int dst1_offset,
-                                    __global int *mat_dst2, int dst2_step, int dst2_offset,
+                                  __global int *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global int *mat_dst1, int dst1_step, int dst1_offset, 
+	                                __global int *mat_dst2, int dst2_step, int dst2_offset,  
                                  __global int *mat_dst3, int dst3_step, int dst3_offset,
                                  int rows, int cols, int dst_step1)

@@ -931,14 +1000,14 @@ __kernel void split_vector_C4_D4 (__global int *mat_src,  int src_step,  int src
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
-        int src_idx  = mad24(y, src_step,  src_offset);
+        int src_idx  = mad24(y, src_step,  src_offset); 
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
        int dst2_idx = mad24(y, dst2_step, dst2_offset);
        int dst3_idx = mad24(y, dst3_step, dst3_offset);
-
+           
        int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x];

        ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -948,18 +1017,18 @@ __kernel void split_vector_C4_D4 (__global int *mat_src,  int src_step,  int src
    }
 }
 __kernel void split_vector_C3_D4 (__global int *mat_src,  int src_step,  int src_offset,
-                                  __global int *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global int *mat_dst1, int dst1_step, int dst1_offset,
-                                    __global int *mat_dst2, int dst2_step, int dst2_offset,
+                                  __global int *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global int *mat_dst1, int dst1_step, int dst1_offset, 
+	                                __global int *mat_dst2, int dst2_step, int dst2_offset,  
                                  int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
-        int src_idx  = mad24(y, src_step,  src_offset);
+        int src_idx  = mad24(y, src_step,  src_offset); 
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
        int dst2_idx = mad24(y, dst2_step, dst2_offset);
@@ -975,20 +1044,20 @@ __kernel void split_vector_C3_D4 (__global int *mat_src,  int src_step,  int src
 }

 __kernel void split_vector_C2_D4 (__global int *mat_src,  int src_step,  int src_offset,
-                                  __global int *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global int *mat_dst1, int dst1_step, int dst1_offset,
+                                  __global int *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global int *mat_dst1, int dst1_step, int dst1_offset, 
                                  int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
-        int src_idx  = mad24(y, src_step,  src_offset);
+        int src_idx  = mad24(y, src_step,  src_offset); 
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
-
+           
        int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x];

        ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -997,9 +1066,9 @@ __kernel void split_vector_C2_D4 (__global int *mat_src,  int src_step,  int src
 }

 __kernel void split_vector_C4_D5 (__global float *mat_src,  int src_step,  int src_offset,
-                                  __global float *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global float *mat_dst1, int dst1_step, int dst1_offset,
-                                    __global float *mat_dst2, int dst2_step, int dst2_offset,
+                                  __global float *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global float *mat_dst1, int dst1_step, int dst1_offset, 
+	                                __global float *mat_dst2, int dst2_step, int dst2_offset,  
                                  __global float *mat_dst3, int dst3_step, int dst3_offset,
                                  int rows, int cols, int dst_step1)

@@ -1007,14 +1076,14 @@ __kernel void split_vector_C4_D5 (__global float *mat_src,  int src_step,  int s
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
-        int src_idx  = mad24(y, src_step,  src_offset);
+        int src_idx  = mad24(y, src_step,  src_offset); 
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
        int dst2_idx = mad24(y, dst2_step, dst2_offset);
        int dst3_idx = mad24(y, dst3_step, dst3_offset);
-
+           
        float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x];

        ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1025,18 +1094,18 @@ __kernel void split_vector_C4_D5 (__global float *mat_src,  int src_step,  int s
 }

 __kernel void split_vector_C3_D5 (__global float *mat_src,  int src_step,  int src_offset,
-                                  __global float *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global float *mat_dst1, int dst1_step, int dst1_offset,
-                                    __global float *mat_dst2, int dst2_step, int dst2_offset,
+                                  __global float *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global float *mat_dst1, int dst1_step, int dst1_offset, 
+	                                __global float *mat_dst2, int dst2_step, int dst2_offset,  
                                  int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
-        int src_idx  = mad24(y, src_step,  src_offset);
+        int src_idx  = mad24(y, src_step,  src_offset); 
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
        int dst2_idx = mad24(y, dst2_step, dst2_offset);
@@ -1052,20 +1121,20 @@ __kernel void split_vector_C3_D5 (__global float *mat_src,  int src_step,  int s
 }

 __kernel void split_vector_C2_D5 (__global float *mat_src,  int src_step,  int src_offset,
-                                  __global float *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global float *mat_dst1, int dst1_step, int dst1_offset,
+                                  __global float *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global float *mat_dst1, int dst1_step, int dst1_offset, 
                                  int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
-        int src_idx  = mad24(y, src_step,  src_offset);
+        int src_idx  = mad24(y, src_step,  src_offset); 
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
-
+           
        float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x];

        ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1075,9 +1144,9 @@ __kernel void split_vector_C2_D5 (__global float *mat_src,  int src_step,  int s

 #if defined (DOUBLE_SUPPORT)
 __kernel void split_vector_C4_D6 (__global double *mat_src,  int src_step,  int src_offset,
-                                  __global double *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global double *mat_dst1, int dst1_step, int dst1_offset,
-                                    __global double *mat_dst2, int dst2_step, int dst2_offset,
+                                  __global double *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global double *mat_dst1, int dst1_step, int dst1_offset, 
+	                                __global double *mat_dst2, int dst2_step, int dst2_offset,  
                                  __global double *mat_dst3, int dst3_step, int dst3_offset,
                                  int rows, int cols, int dst_step1)

@@ -1085,14 +1154,14 @@ __kernel void split_vector_C4_D6 (__global double *mat_src,  int src_step,  int
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
-        int src_idx  = mad24(y, src_step,  src_offset);
+        int src_idx  = mad24(y, src_step,  src_offset); 
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
        int dst2_idx = mad24(y, dst2_step, dst2_offset);
        int dst3_idx = mad24(y, dst3_step, dst3_offset);
-
+           
        double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x];

        ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1103,18 +1172,18 @@ __kernel void split_vector_C4_D6 (__global double *mat_src,  int src_step,  int
 }

 __kernel void split_vector_C3_D6 (__global double *mat_src,  int src_step,  int src_offset,
-                                  __global double *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global double *mat_dst1, int dst1_step, int dst1_offset,
-                                    __global double *mat_dst2, int dst2_step, int dst2_offset,
+                                  __global double *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global double *mat_dst1, int dst1_step, int dst1_offset, 
+	                                __global double *mat_dst2, int dst2_step, int dst2_offset,  
                                  int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
-        int src_idx  = mad24(y, src_step,  src_offset);
+        int src_idx  = mad24(y, src_step,  src_offset); 
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
        int dst2_idx = mad24(y, dst2_step, dst2_offset);
@@ -1130,20 +1199,20 @@ __kernel void split_vector_C3_D6 (__global double *mat_src,  int src_step,  int
 }

 __kernel void split_vector_C2_D6 (__global double *mat_src,  int src_step,  int src_offset,
-                                  __global double *mat_dst0, int dst0_step, int dst0_offset,
-                                  __global double *mat_dst1, int dst1_step, int dst1_offset,
+                                  __global double *mat_dst0, int dst0_step, int dst0_offset,  
+                                  __global double *mat_dst1, int dst1_step, int dst1_offset, 
                                  int rows, int cols, int dst_step1)

 {
    int x = get_global_id(0);
    int y = get_global_id(1);

-    if((x  < cols) && (y < rows))
+    if((x  < cols) && (y < rows)) 
    {
-        int src_idx  = mad24(y, src_step,  src_offset);
+        int src_idx  = mad24(y, src_step,  src_offset); 
        int dst0_idx = mad24(y, dst0_step, dst0_offset);
        int dst1_idx = mad24(y, dst1_step, dst1_offset);
-
+           
        double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x];

        ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;